#!/usr/bin/env python3
"""Download module content, linked files and videos from Canvas courses.

For every module item of a course the script stores a local copy below
``<output>/<course name>/<module name>/``:

* ``File`` items are downloaded as-is.
* ``Page`` and ``Assignment`` items are written as ``.html`` files, and the
  files and videos referenced in their HTML are downloaded next to them.
* ``ExternalUrl`` items are written as ``.url`` shortcut files.

Afterwards every course file that was not reached through a module is
downloaded into the course folder.
"""
import argparse
import json
import os
import re
import subprocess
import tempfile
from dataclasses import dataclass, field
from urllib.parse import unquote, urlparse

import requests
from canvasapi import Canvas
from canvasapi.course import Course
from canvasapi.exceptions import Forbidden, ResourceDoesNotExist, Unauthorized
from canvasapi.file import File
from canvasapi.module import Module, ModuleItem
from pathvalidate import sanitize_filename

# File extensions reported as videos during the final "all course files" pass.
VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv')

# Seconds to wait for the server before a video download is abandoned.
DEFAULT_TIMEOUT = 60


def extract_files(text):
    """Return the set of file IDs (as strings) found in ``/files/<id>`` links.

    Args:
        text: HTML (or JSON-encoded HTML) to scan; ``None`` or ``""`` is allowed.

    Returns:
        A set of numeric ID strings; empty when nothing matches.
    """
    if not text:
        return set()
    return set(re.findall(r"/files/(\d+)", text, re.IGNORECASE))


def extract_media_objects(text):
    """Extract media object IDs from video tags, thumbnails and Arc embeds.

    Args:
        text: HTML (or JSON-encoded HTML) to scan; ``None`` or ``""`` is allowed.

    Returns:
        A set of ID strings collected from all supported patterns.
    """
    if not text:
        return set()

    # media_objects/<id> in thumbnails and video elements.
    media_ids = re.findall(r"media_objects/([a-zA-Z0-9-]+)", text, re.IGNORECASE)
    # data-media_comment_id attributes; the character class also skips the
    # backslash of JSON-escaped quotes.
    comment_ids = re.findall(
        r'data-media_comment_id[=\\"]*([a-zA-Z0-9-]+)', text, re.IGNORECASE
    )
    # media_attachments_iframe/<numeric id>.
    attachment_ids = re.findall(
        r"media_attachments_iframe/(\d+)", text, re.IGNORECASE
    )
    # custom_arc_media_id followed by "=", ":" or a URL-encoded "=" ("%3D").
    arc_ids = re.findall(
        r"custom_arc_media_id[=:%]([a-zA-Z0-9-]+)", text, re.IGNORECASE
    )
    # For "%3D<id>" the pattern consumes only "%", so the capture starts with
    # the leftover "3D"; drop it to get the bare ID.
    arc_ids = [
        mid[2:] if mid.startswith('3D') and len(mid) > 2 else mid
        for mid in arc_ids
    ]

    return set(media_ids) | set(comment_ids) | set(attachment_ids) | set(arc_ids)


def extract_video_files(text):
    """Extract file IDs from ``instructure_video_link`` elements and download links.

    Args:
        text: HTML (or JSON-encoded HTML) to scan; ``None`` or ``""`` is allowed.

    Returns:
        A set of numeric ID strings.
    """
    if not text:
        return set()

    # Video links carrying the file ID in their data-api-endpoint attribute
    # (plain or JSON-escaped quoting).
    endpoint_ids = re.findall(
        r'instructure_video_link[^>]*data-api-endpoint[=\\"]*[^"\'\\]*?/files/(\d+)',
        text,
        re.IGNORECASE,
    )
    # Any /files/<id>/download link.
    download_ids = re.findall(r'/files/(\d+)/download', text, re.IGNORECASE)
    return set(endpoint_ids) | set(download_ids)


def extract_video_titles(text):
    """Map media/file IDs to human-readable titles found next to the videos.

    Args:
        text: HTML (or JSON-encoded HTML) to scan; ``None`` or ``""`` is allowed.

    Returns:
        A dict ``{id string: title}``. Titles taken from the link text never
        override titles found through the other two patterns.
    """
    video_titles = {}
    if not text:
        return video_titles

    # JSON-escaped list entries: "<li>TITLE <br><video ... data-media_comment_id="ID"".
    li_pattern = re.findall(
        r'\\u003cli\\u003e([^\\]+?) \\u003cbr\\u003e\\u003cvideo[^>]*data-media_comment_id=\\"([^"]+)\\"',
        text,
        re.IGNORECASE,
    )
    for title, media_id in li_pattern:
        # Strip escaped anchor/HTML tags. The capture group already excludes
        # backslashes, so these replacements are purely defensive.
        clean_title = title.replace('\\u003ca ', '').replace('\\u003c/a\\u003e', '')
        clean_title = re.sub(r'\\u003c[^>]*\\u003e', '', clean_title)
        clean_title = clean_title.strip()
        if clean_title:
            video_titles[media_id] = clean_title

    # instructure_video_link elements with a JSON-escaped title attribute.
    video_file_pattern = re.findall(
        r'instructure_video_link.*?title=\\"([^"]+)\\".*?data-api-endpoint[^>]*?/files/(\d+)',
        text,
        re.IGNORECASE,
    )
    for title, file_id in video_file_pattern:
        clean_title = title.replace('.mp4', '').replace('.avi', '').replace('_small', '')
        if clean_title:
            video_titles[file_id] = clean_title

    # Fall back to the visible link text (e.g. "Part 1", "Part 2").
    part_pattern = re.findall(
        r'instructure_video_link.*?data-api-endpoint[^>]*?/files/(\d+)[^>]*>([^<]+)<',
        text,
        re.IGNORECASE,
    )
    for file_id, part_text in part_pattern:
        clean_text = part_text.strip().rstrip('.')
        if clean_text and file_id not in video_titles:
            video_titles[file_id] = clean_text

    return video_titles


def _slug(text, max_length):
    """Reduce *text* to word characters, hyphens and underscores, then truncate."""
    text = re.sub(r'[^\w\s-]', '', text)  # drop everything but word chars, spaces, hyphens
    text = re.sub(r'\s+', '_', text)  # collapse whitespace runs into underscores
    return text[:max_length]


def generate_video_filename(media_id, video_titles, module_name, video_counter):
    """Generate a descriptive ``.mp4`` filename for a video.

    Args:
        media_id: ID used to look up a title in *video_titles*.
        video_titles: Mapping of ID to title, as built by ``extract_video_titles``.
        module_name: Name of the module; used when no title is known.
        video_counter: Running number, zero-padded to two digits as a prefix.

    Returns:
        ``"NN_<title>.mp4"`` when a title is known (title limited to 50
        characters), otherwise ``"NN_<module>_video.mp4"`` (module name
        limited to 30 characters).
    """
    if media_id in video_titles:
        return f"{video_counter:02d}_{_slug(video_titles[media_id], 50)}.mp4"
    return f"{video_counter:02d}_{_slug(module_name, 30)}_video.mp4"


def download_video_file(url, path, headers=None, timeout=DEFAULT_TIMEOUT):
    """Download a video file from *url* to the local *path*.

    The data is streamed into ``<path>.part`` and renamed once complete, so
    an interrupted transfer never leaves a truncated file at *path* and
    never damages a file that already exists there.

    Args:
        url: Address to fetch; redirects are followed.
        path: Destination file; missing parent directories are created.
        headers: Optional HTTP headers (e.g. the Authorization header).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True on success, False when the request or the file write failed.
    """
    partial_path = path + ".part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, headers=headers, stream=True,
                          allow_redirects=True, timeout=timeout) as response:
            response.raise_for_status()
            # dirname is "" for a bare filename; makedirs("") would raise.
            os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, path)
        return True
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading video {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written yet
        return False


def create_link_file(url, path, title=None):
    """Create an ``[InternetShortcut]`` link file pointing to *url*.

    Args:
        url: Target address written as ``URL=<url>``.
        path: Destination file; missing parent directories are created.
        title: Optional title, stored as a ``; Title:`` comment line.

    Returns:
        True on success, False when the file could not be written.
    """
    try:
        # dirname is "" for a bare filename; makedirs("") would raise.
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write("[InternetShortcut]\n")
            f.write(f"URL={url}\n")
            if title:
                f.write(f"; Title: {title}\n")
        return True
    except OSError as e:
        print(f"Error creating link file {path}: {e}")
        return False


@dataclass
class _ScrapeState:
    """Per-course bookkeeping shared by the item handlers."""

    course: Course
    client: Canvas
    canvas_url: str
    auth_headers: dict
    # IDs of files already downloaded for this course (normalised, see _as_int_id).
    files_downloaded: set = field(default_factory=set)
    # Media IDs already downloaded or linked for this course.
    videos_downloaded: set = field(default_factory=set)
    # Running video number per module name.
    video_counters: dict = field(default_factory=dict)


def _as_int_id(value):
    """Normalise an ID to ``int`` so API ids and regex-extracted ids compare equal."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return value


def _is_arc_uuid(media_id):
    """Return True for long, hyphenated IDs that do not start with ``m-``."""
    return '-' in media_id and len(media_id) > 20 and not media_id.startswith('m-')


def _media_redirect_url(canvas_url, media_id):
    """Build the redirect URL for a media attachment (numeric) or media object."""
    if media_id.isdigit():
        return f"{canvas_url}/media_attachments/{media_id}/redirect"
    return f"{canvas_url}/media_objects/{media_id}/redirect?bitrate=127908"


def _next_video_number(state, module_name):
    """Increment and return the running video number of a module."""
    state.video_counters[module_name] = state.video_counters.get(module_name, 0) + 1
    return state.video_counters[module_name]


def _download_linked_files(state, html, path):
    """Download every ``/files/<id>`` referenced in *html* into *path*."""
    for file_id in sorted(extract_files(html)):
        file_key = _as_int_id(file_id)
        if file_key in state.files_downloaded:
            continue
        try:
            file = state.course.get_file(file_id)
            state.files_downloaded.add(file_key)
            file.download(path + sanitize_filename(file.filename))
        except (ResourceDoesNotExist, Unauthorized, Forbidden) as e:
            # Best effort: a linked file may be missing or not accessible.
            print(f"Skipping linked file {file_id}: {type(e).__name__}")


def _save_file_item(state, item, path):
    """Download the file behind a ``File`` module item."""
    file = state.client.get_file(item.content_id)
    state.files_downloaded.add(_as_int_id(item.content_id))
    file.download(path + sanitize_filename(file.filename))


def _save_page_media(state, module, item, path, media_id, video_titles):
    """Download one media object of a page, or write a link file for Arc UUIDs."""
    if _is_arc_uuid(media_id):
        # Arc UUID videos are not fetched; a link to the page is stored instead.
        print(f"Found Arc UUID video: {media_id}")
        video_filename = generate_video_filename(
            media_id, video_titles, module.name,
            _next_video_number(state, module.name),
        ).replace('.mp4', '.link')
        link_path = path + sanitize_filename(video_filename)
        video_title = video_titles.get(media_id, item.title)
        print(f"Creating link file: {video_filename} -> {item.html_url}")
        if create_link_file(item.html_url, link_path, video_title):
            state.videos_downloaded.add(media_id)
            print(f"✅ Created link: {video_filename}")
        else:
            print(f"❌ Failed to create link: {video_filename}")
        return

    if media_id.isdigit():
        print(f"Found Arc video via media_attachments: {media_id}")
    video_url = _media_redirect_url(state.canvas_url, media_id)
    video_filename = generate_video_filename(
        media_id, video_titles, module.name,
        _next_video_number(state, module.name),
    )
    video_path = path + sanitize_filename(video_filename)
    print(f"Downloading video: {media_id} -> {video_filename}")
    if download_video_file(video_url, video_path, headers=state.auth_headers):
        state.videos_downloaded.add(media_id)
        print(f"Downloaded: {video_filename}")
    else:
        print(f"Failed to download: {video_filename}")


def _save_page_item(state, module, item, path):
    """Store a ``Page`` item as HTML together with its linked files and videos."""
    page = state.course.get_page(item.page_url)
    body = page.body or ""
    with open(path + sanitize_filename(item.title) + ".html", "w", encoding="utf-8") as f:
        f.write(body)

    _download_linked_files(state, body, path)

    video_titles = extract_video_titles(body)
    # Sorted so that the numbering of the generated filenames is repeatable.
    for media_id in sorted(extract_media_objects(body)):
        if media_id in state.videos_downloaded:
            continue
        try:
            _save_page_media(state, module, item, path, media_id, video_titles)
        except Exception as e:  # best-effort boundary: one video must not stop the page
            print(f"Error processing video {media_id}: {e}")

    for video_file_id in sorted(extract_video_files(body)):
        file_key = _as_int_id(video_file_id)
        if file_key in state.files_downloaded:
            continue
        try:
            file = state.course.get_file(video_file_id)
            state.files_downloaded.add(file_key)
            video_filename = generate_video_filename(
                video_file_id, video_titles, module.name,
                _next_video_number(state, module.name),
            )
            video_path = path + sanitize_filename(video_filename)
            print(f"Downloading video file: {video_file_id} -> {video_filename}")
            file.download(video_path)
            print(f"Downloaded: {video_filename}")
        except ResourceDoesNotExist:
            print(f"Video file not found: {video_file_id}")
        except Exception as e:  # best-effort boundary, see above
            print(f"Error downloading video file {video_file_id}: {e}")


def _save_external_url_item(item, path):
    """Store an ``ExternalUrl`` item as a ``.url`` shortcut file."""
    with open(path + sanitize_filename(item.title) + ".url", "w", encoding="utf-8") as f:
        f.write("[InternetShortcut]\n")
        f.write("URL=" + item.external_url)


def _save_assignment_media(state, assignment, path, media_id):
    """Download one media object of an assignment, or write a link file for Arc UUIDs."""
    if _is_arc_uuid(media_id):
        # Arc UUID videos are not fetched; a link to the assignment is stored instead.
        print(f"Found Arc UUID video in assignment: {media_id}")
        video_filename = f"{media_id}.link"
        link_path = path + sanitize_filename(video_filename)
        print(f"Creating link file: {video_filename} -> {assignment.html_url}")
        if create_link_file(assignment.html_url, link_path, assignment.name):
            state.videos_downloaded.add(media_id)
            print(f"✅ Created link: {video_filename}")
        else:
            print(f"❌ Failed to create link: {video_filename}")
        return

    if media_id.isdigit():
        print(f"Found Arc video in assignment via media_attachments: {media_id}")
    video_url = _media_redirect_url(state.canvas_url, media_id)
    video_filename = f"{media_id}.mp4"
    video_path = path + sanitize_filename(video_filename)
    print(f"Downloading video from assignment: {media_id}")
    if download_video_file(video_url, video_path, headers=state.auth_headers):
        state.videos_downloaded.add(media_id)
        print(f"Downloaded: {video_filename}")
    else:
        print(f"Failed to download: {video_filename}")


def _save_assignment_item(state, item, path):
    """Store an ``Assignment`` item as HTML together with its linked files and videos."""
    assignment = state.course.get_assignment(item.content_id)
    description = assignment.description or ""
    with open(path + sanitize_filename(item.title) + ".html", "w", encoding="utf-8") as f:
        f.write(description)

    _download_linked_files(state, description, path)

    for media_id in sorted(extract_media_objects(description)):
        if media_id in state.videos_downloaded:
            continue
        try:
            _save_assignment_media(state, assignment, path, media_id)
        except Exception as e:  # best-effort boundary: one video must not stop the item
            print(f"Error processing assignment video {media_id}: {e}")

    for video_file_id in sorted(extract_video_files(description)):
        file_key = _as_int_id(video_file_id)
        if file_key in state.files_downloaded:
            continue
        try:
            file = state.course.get_file(video_file_id)
            state.files_downloaded.add(file_key)
            print(f"Downloading assignment video file: {file.filename}")
            file.download(path + sanitize_filename(file.filename))
            print(f"Downloaded: {file.filename}")
        except ResourceDoesNotExist:
            print(f"Assignment video file not found: {video_file_id}")
        except Exception as e:  # best-effort boundary, see above
            print(f"Error downloading assignment video file {video_file_id}: {e}")


def get_course_files(course, canvas_url, canvas_token, output_dir=None, client=None):
    """Download all module items and remaining files of one course.

    Args:
        course: The canvasapi course to scrape.
        canvas_url: Base URL of the Canvas instance; used to build video
            redirect URLs.
        canvas_token: API token, sent as a Bearer token for video downloads.
        output_dir: Root output folder. Defaults to the module-level
            ``output`` set by the command-line entry point.
        client: ``Canvas`` client used to fetch ``File`` items. Defaults to
            the module-level ``canvas`` set by the command-line entry point.

    Module items that are missing or not accessible are reported and
    skipped; they no longer abort the whole course.
    """
    if output_dir is None:
        output_dir = output  # global defined in the __main__ block
    if client is None:
        client = canvas  # global defined in the __main__ block

    state = _ScrapeState(
        course=course,
        client=client,
        # Avoid "//" in the generated URLs when the base URL ends with a slash.
        canvas_url=canvas_url.rstrip("/"),
        auth_headers={'Authorization': f'Bearer {canvas_token}'},
    )

    module: Module
    for module in course.get_modules():
        item: ModuleItem
        for item in module.get_module_items():
            try:
                # The trailing "" keeps a path separator at the end, because
                # the handlers append the filename by plain concatenation.
                path = os.path.join(
                    output_dir,
                    sanitize_filename(course.name),
                    sanitize_filename(module.name),
                    "",
                )
            except Exception as e:
                print(e)
                continue

            os.makedirs(path, exist_ok=True)

            item_type = item.type
            print(f"{course.name} - "
                  f"{module.name} - "
                  f"{item.title} ({item_type})")

            try:
                if item_type == "File":
                    _save_file_item(state, item, path)
                elif item_type == "Page":
                    _save_page_item(state, module, item, path)
                elif item_type == "ExternalUrl":
                    _save_external_url_item(item, path)
                elif item_type == "Assignment":
                    _save_assignment_item(state, item, path)
            except (ResourceDoesNotExist, Unauthorized, Forbidden) as e:
                print(f"Skipping {item.title}: {type(e).__name__}")

    # Finally fetch every course file that was not reached through a module.
    try:
        file: File
        for file in course.get_files():
            if _as_int_id(file.id) in state.files_downloaded:
                continue
            print(f"{course.name} - {file.filename}")
            course_dir = os.path.join(output_dir, sanitize_filename(course.name))
            # The folder does not exist yet when the course has no module items.
            os.makedirs(course_dir, exist_ok=True)
            if file.filename and file.filename.lower().endswith(VIDEO_EXTENSIONS):
                print(f"Found video file: {file.filename}")
            file.download(os.path.join(course_dir, sanitize_filename(file.filename)))
    except (Unauthorized, Forbidden) as e:
        # Listing or downloading course files may be disallowed for this user.
        print(f"Cannot access course files of {course.name}: {type(e).__name__}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download all content from Canvas")
    parser.add_argument("url", help="URL to the Canvas website, e.g. https://canvas.utwente.nl")
    parser.add_argument("token", help="Token generated in the settings page on Canvas")
    parser.add_argument("output", help="Path to the output folder, e.g. output/")
    parser.add_argument("courses", help="Comma-separated course IDs or 'all'", nargs="?", const="all")
    args = parser.parse_args()

    # Normalise the output folder to exactly one trailing slash.
    output = args.output.rstrip("/") + "/"

    if args.courses is None:
        args.courses = "all"
        print("No courses specified. Scraping all courses.")

    canvas = Canvas(args.url, args.token)

    # Select courses to scrape, default to all.
    if args.courses != "all":
        courses = [
            canvas.get_course(int(course_id))
            for course_id in args.courses.split(",")
            if course_id.strip()  # tolerate stray commas such as "12,34,"
        ]
    else:
        courses = canvas.get_courses()

    # Perform scrape; an inaccessible course must not stop the remaining ones.
    course: Course
    for course in courses:
        try:
            get_course_files(course, args.url, args.token, output_dir=output, client=canvas)
        except (ResourceDoesNotExist, Unauthorized, Forbidden) as e:
            print(f"Skipping course {getattr(course, 'id', '?')}: {type(e).__name__}")