#!bin/python3
import argparse
import os
import re
import requests
import subprocess
import tempfile
import json
from urllib.parse import unquote, urlparse
from pathvalidate import sanitize_filename
from canvasapi import Canvas
from canvasapi.course import Course
from canvasapi.exceptions import Unauthorized, ResourceDoesNotExist, Forbidden
from canvasapi.file import File
from canvasapi.module import Module, ModuleItem
def extract_files(text):
    """Collect the distinct numeric IDs that appear in ``/files/<id>`` references.

    Args:
        text: Markup or any other string to scan.

    Returns:
        A set of ID strings (digits only); empty when nothing matches.
    """
    return {
        match.group(1)
        for match in re.finditer("/files/(\\d+)", text, re.IGNORECASE)
    }
def extract_media_objects(text):
    """Gather every media identifier referenced in *text*.

    Four kinds of reference are scanned for (all case-insensitively):
    ``media_objects/<id>``, ``data-media_comment_id`` attributes,
    ``media_attachments_iframe/<digits>`` and ``custom_arc_media_id``
    parameters.

    Returns:
        A set of ID strings; empty when nothing matches.
    """
    simple_patterns = (
        # media_objects/XXXXX in thumbnails and video elements
        r"media_objects/([a-zA-Z0-9-]+)",
        # data-media_comment_id attributes (optionally JSON-escaped quotes)
        r'data-media_comment_id[=\\"]*([a-zA-Z0-9-]+)',
        # media_attachments_iframe references carry a numeric ID
        r"media_attachments_iframe/(\d+)",
    )
    found = set()
    for pattern in simple_patterns:
        found.update(re.findall(pattern, text, re.IGNORECASE))

    # custom_arc_media_id may be followed by '=', ':' or '%'. In the '%' case
    # the captured value begins with the "3D" left over from "%3D", so that
    # prefix is dropped whenever something follows it.
    for arc_id in re.findall(r"custom_arc_media_id[=:%]([a-zA-Z0-9-]+)", text, re.IGNORECASE):
        if arc_id.startswith('3D') and len(arc_id) > 2:
            arc_id = arc_id[2:]
        found.add(arc_id)
    return found
def extract_video_files(text):
    """Find file IDs that are referenced as videos or as direct downloads.

    Returns:
        A set of numeric ID strings taken from ``instructure_video_link``
        elements (via their ``data-api-endpoint``) and from
        ``/files/<id>/download`` URLs.
    """
    # Video links; tolerates JSON-escaped quotes around the endpoint value.
    link_ids = re.findall(
        r'instructure_video_link[^>]*data-api-endpoint[=\\"]*[^"\'\\]*?/files/(\d+)',
        text,
        re.IGNORECASE,
    )
    # Plain download URLs.
    download_ids = re.findall(r'/files/(\d+)/download', text, re.IGNORECASE)
    return {*link_ids, *download_ids}
def extract_video_titles(text):
    """Map media/file IDs found in *text* to a human-readable video title.

    Three layouts are recognised, in this order:

    1. A JSON-escaped list item whose text is followed by a ``video`` tag
       carrying ``data-media_comment_id`` (key: the media ID).
    2. An ``instructure_video_link`` with a JSON-escaped ``title`` attribute
       (key: the file ID; ``.mp4``, ``.avi`` and ``_small`` are removed).
    3. An ``instructure_video_link`` followed by link text (key: the file ID);
       only used when layout 2 did not already supply a title for that ID.

    Args:
        text: Page or assignment markup to scan.

    Returns:
        dict mapping ID string -> title string. Empty when nothing matches.
    """
    video_titles = {}

    # Layout 1 (seen in course 12329), JSON-escaped markup of the form:
    #   \u003cli\u003eTITLE \u003cbr\u003e\u003cvideo ... data-media_comment_id=\"ID\"
    li_pattern = re.findall(
        r'\\u003cli\\u003e([^\\]+?) \\u003cbr\\u003e\\u003cvideo[^>]*data-media_comment_id=\\"([^"]+)\\"',
        text,
        re.IGNORECASE,
    )
    for title, media_id in li_pattern:
        # The capture group excludes backslashes, so the title can never hold
        # an escaped tag such as \u003ca; trimming whitespace is all the
        # clean-up that can have an effect.
        clean_title = title.strip()
        if clean_title:
            video_titles[media_id] = clean_title

    # Layout 2 (seen in course 10098): instructure_video_link with a title attribute.
    video_file_pattern = re.findall(
        r'instructure_video_link.*?title=\\"([^"]+)\\".*?data-api-endpoint[^>]*?/files/(\d+)',
        text,
        re.IGNORECASE,
    )
    for title, file_id in video_file_pattern:
        clean_title = title.replace('.mp4', '').replace('.avi', '').replace('_small', '')
        if clean_title:
            video_titles[file_id] = clean_title

    # Layout 3: video links with accompanying text ("Part 1", "Part 2", ...).
    part_pattern = re.findall(
        r'instructure_video_link.*?data-api-endpoint[^>]*?/files/(\d+)[^>]*>([^<]+)<',
        text,
        re.IGNORECASE,
    )
    for file_id, part_text in part_pattern:
        clean_text = part_text.strip().rstrip('.')
        # A title attribute (layout 2) wins over plain link text.
        if clean_text and file_id not in video_titles:
            video_titles[file_id] = clean_text

    return video_titles
def generate_video_filename(media_id, video_titles, module_name, video_counter):
    """Build a descriptive ``.mp4`` filename for a video.

    Args:
        media_id: ID used to look the video up in *video_titles*.
        video_titles: Mapping of ID -> extracted title.
        module_name: Name used for the fallback filename.
        video_counter: Integer rendered as a zero-padded two-digit prefix.

    Returns:
        ``NN_<title>.mp4`` when a title is known (title capped at 50
        characters), otherwise ``NN_<module>_video.mp4`` (module name capped
        at 30 characters).
    """
    def _slug(raw, limit):
        # Keep word characters, whitespace and hyphens; then turn each
        # whitespace run into one underscore and cap the length.
        kept = re.sub(r'[^\w\s-]', '', raw)
        return re.sub(r'\s+', '_', kept)[:limit]

    if media_id not in video_titles:
        # No extracted title: fall back to module name + counter.
        module_slug = _slug(module_name, 30)
        return f"{video_counter:02d}_{module_slug}_video.mp4"

    title_slug = _slug(video_titles[media_id], 50)
    return f"{video_counter:02d}_{title_slug}.mp4"
def download_video_file(url, path, headers=None, timeout=60):
    """Stream the resource at *url* into the local file *path*.

    The body is written to ``<path>.part`` first and renamed onto *path* only
    after the transfer finished, so a failed download never leaves a
    truncated file under the final name.

    Args:
        url: Address to fetch; redirects are followed.
        path: Destination file. Missing parent directories are created.
        headers: Optional dict of HTTP headers to send with the request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without it a stalled server would block the script
            indefinitely.

    Returns:
        True on success, False when the request or a filesystem operation
        failed (the error is printed).
    """
    partial_path = f"{path}.part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, headers=headers, stream=True,
                          allow_redirects=True, timeout=timeout) as response:
            response.raise_for_status()
            directory = os.path.dirname(path)
            # dirname is "" for a bare filename, and os.makedirs("") raises.
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, path)
        return True
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading video {url}: {e}")
        # Discard whatever was written before the failure.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False
def create_link_file(url, path, title=None):
    """Write an ``[InternetShortcut]`` file at *path* that points to *url*.

    Args:
        url: Target address, written as the ``URL=`` entry.
        path: Destination file. Missing parent directories are created; a
            bare filename (no directory part) is written to the current
            working directory.
        title: Optional title, appended as a ``; Title: ...`` line.

    Returns:
        True when the file was written, False otherwise (the error is printed).
    """
    try:
        directory = os.path.dirname(path)
        # dirname is "" for a bare filename, and os.makedirs("") raises.
        if directory:
            os.makedirs(directory, exist_ok=True)
        lines = ["[InternetShortcut]\n", f"URL={url}\n"]
        if title:
            lines.append(f"; Title: {title}\n")
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(lines)
        return True
    except (OSError, UnicodeError) as e:
        print(f"Error creating link file {path}: {e}")
        return False
def _is_arc_uuid(media_id):
    """Return True for long, dash-separated IDs that do not start with ``m-``."""
    return '-' in media_id and len(media_id) > 20 and not media_id.startswith('m-')


def _media_redirect_url(canvas_url, media_id):
    """Build the redirect URL from which the video for *media_id* is fetched."""
    if media_id.isdigit():
        # Purely numeric IDs are treated as media_attachments IDs.
        return f"{canvas_url}/media_attachments/{media_id}/redirect"
    # Everything else is treated as a standard Canvas media object.
    return f"{canvas_url}/media_objects/{media_id}/redirect?bitrate=127908"


def _download_embedded_files(course, html, path, files_downloaded):
    """Download each ``/files/<id>`` reference in *html* not yet in *files_downloaded*."""
    for file_id in extract_files(html):
        if file_id in files_downloaded:
            continue
        try:
            file = course.get_file(file_id)
            files_downloaded.add(file_id)
            file.download(path + sanitize_filename(file.filename))
        except (ResourceDoesNotExist, Unauthorized, Forbidden) as e:
            # Best effort: a missing or inaccessible file must not abort the scrape.
            print(f"Skipping file {file_id}: {e}")


def get_course_files(course, canvas_url, canvas_token):
    """Save the module content, embedded files and videos of one course to disk.

    For every module item the function writes into
    ``<output>/<course name>/<module name>/``:

    * ``File`` items: the file itself.
    * ``Page`` / ``Assignment`` items: the body as ``.html``, every file the
      body references, and every video it references. Arc UUID videos are
      stored as ``.link`` files pointing back to the Canvas page instead.
    * ``ExternalUrl`` items: a ``.url`` shortcut.

    Afterwards every course file that was not fetched above is saved to
    ``<output>/<course name>/``.

    NOTE: this function reads the module-level names ``output`` (destination
    folder) and ``canvas`` (API client), which the script entry point sets
    before calling it.

    Args:
        course: canvasapi course object to scrape.
        canvas_url: Base URL of the Canvas instance, used to build video URLs.
        canvas_token: API token, sent as a Bearer header when fetching videos.
    """
    auth_headers = {'Authorization': f'Bearer {canvas_token}'}
    # IDs are stored as strings throughout: the regex helpers yield strings
    # while the API objects expose integers, and mixing both would make the
    # duplicate checks miss.
    files_downloaded = set()   # file IDs already fetched for this course
    videos_downloaded = set()  # media IDs already fetched or linked
    module_video_counter = {}  # module name -> number of videos named so far

    for module in course.get_modules():
        module: Module = module
        for item in module.get_module_items():
            item: ModuleItem = item
            try:
                path = f"{output}/" \
                       f"{sanitize_filename(course.name)}/" \
                       f"{sanitize_filename(module.name)}/"
            except Exception as e:
                print(e)
                continue
            os.makedirs(path, exist_ok=True)

            item_type = item.type
            print(f"{course.name} - "
                  f"{module.name} - "
                  f"{item.title} ({item_type})")

            if item_type == "File":
                file = canvas.get_file(item.content_id)
                files_downloaded.add(str(item.content_id))
                file.download(path + sanitize_filename(file.filename))

            elif item_type == "Page":
                page = course.get_page(item.page_url)
                body = page.body or ""
                with open(path + sanitize_filename(item.title) + ".html", "w", encoding="utf-8") as f:
                    f.write(body)
                _download_embedded_files(course, body, path, files_downloaded)

                # Download videos from media objects with better filenames
                video_titles = extract_video_titles(body)
                module_video_counter.setdefault(module.name, 0)

                for media_id in extract_media_objects(body):
                    if media_id in videos_downloaded:
                        continue
                    try:
                        if _is_arc_uuid(media_id):
                            # Arc UUID video - create a link file instead
                            print(f"Found Arc UUID video: {media_id}")
                            module_video_counter[module.name] += 1
                            video_filename = generate_video_filename(
                                media_id, video_titles, module.name, module_video_counter[module.name]
                            ).replace('.mp4', '.link')
                            link_path = path + sanitize_filename(video_filename)
                            video_title = video_titles.get(media_id, item.title)
                            print(f"Creating link file: {video_filename} -> {item.html_url}")
                            if create_link_file(item.html_url, link_path, video_title):
                                videos_downloaded.add(media_id)
                                print(f"✅ Created link: {video_filename}")
                            else:
                                print(f"❌ Failed to create link: {video_filename}")
                            continue

                        if media_id.isdigit():
                            print(f"Found Arc video via media_attachments: {media_id}")
                        video_url = _media_redirect_url(canvas_url, media_id)

                        module_video_counter[module.name] += 1
                        video_filename = generate_video_filename(
                            media_id, video_titles, module.name, module_video_counter[module.name]
                        )
                        video_path = path + sanitize_filename(video_filename)
                        print(f"Downloading video: {media_id} -> {video_filename}")
                        if download_video_file(video_url, video_path, headers=auth_headers):
                            videos_downloaded.add(media_id)
                            print(f"Downloaded: {video_filename}")
                        else:
                            print(f"Failed to download: {video_filename}")
                    except Exception as e:
                        print(f"Error processing video {media_id}: {e}")

                # Download videos from video file links with better filenames
                for video_file_id in extract_video_files(body):
                    if video_file_id in files_downloaded:
                        continue
                    try:
                        file = course.get_file(video_file_id)
                        files_downloaded.add(video_file_id)
                        module_video_counter[module.name] += 1
                        video_filename = generate_video_filename(
                            video_file_id, video_titles, module.name, module_video_counter[module.name]
                        )
                        video_path = path + sanitize_filename(video_filename)
                        print(f"Downloading video file: {video_file_id} -> {video_filename}")
                        file.download(video_path)
                        print(f"Downloaded: {video_filename}")
                    except ResourceDoesNotExist:
                        print(f"Video file not found: {video_file_id}")
                    except Exception as e:
                        print(f"Error downloading video file {video_file_id}: {e}")

            elif item_type == "ExternalUrl":
                url = item.external_url
                with open(path + sanitize_filename(item.title) + ".url", "w", encoding="utf-8") as f:
                    f.write("[InternetShortcut]\n")
                    f.write("URL=" + url)

            elif item_type == "Assignment":
                assignment = course.get_assignment(item.content_id)
                description = assignment.description or ""
                with open(path + sanitize_filename(item.title) + ".html", "w", encoding="utf-8") as f:
                    f.write(description)
                _download_embedded_files(course, description, path, files_downloaded)

                # Download videos from assignments as well
                for media_id in extract_media_objects(description):
                    if media_id in videos_downloaded:
                        continue
                    try:
                        if _is_arc_uuid(media_id):
                            # Arc UUID video - create a link file instead
                            print(f"Found Arc UUID video in assignment: {media_id}")
                            video_filename = f"{media_id}.link"
                            link_path = path + sanitize_filename(video_filename)
                            print(f"Creating link file: {video_filename} -> {assignment.html_url}")
                            if create_link_file(assignment.html_url, link_path, assignment.name):
                                videos_downloaded.add(media_id)
                                print(f"✅ Created link: {video_filename}")
                            else:
                                print(f"❌ Failed to create link: {video_filename}")
                            continue

                        if media_id.isdigit():
                            print(f"Found Arc video in assignment via media_attachments: {media_id}")
                        video_url = _media_redirect_url(canvas_url, media_id)

                        video_filename = f"{media_id}.mp4"
                        video_path = path + sanitize_filename(video_filename)
                        print(f"Downloading video from assignment: {media_id}")
                        if download_video_file(video_url, video_path, headers=auth_headers):
                            videos_downloaded.add(media_id)
                            print(f"Downloaded: {video_filename}")
                        else:
                            print(f"Failed to download: {video_filename}")
                    except Exception as e:
                        print(f"Error processing assignment video {media_id}: {e}")

                for video_file_id in extract_video_files(description):
                    if video_file_id in files_downloaded:
                        continue
                    try:
                        file = course.get_file(video_file_id)
                        files_downloaded.add(video_file_id)
                        print(f"Downloading assignment video file: {file.filename}")
                        file.download(path + sanitize_filename(file.filename))
                        print(f"Downloaded: {file.filename}")
                    except ResourceDoesNotExist:
                        print(f"Assignment video file not found: {video_file_id}")
                    except Exception as e:
                        print(f"Error downloading assignment video file {video_file_id}: {e}")

    # Finally fetch every course file that no module item referenced.
    try:
        for file in course.get_files():
            file: File = file
            if str(file.id) in files_downloaded:
                continue
            print(f"{course.name} - {file.filename}")
            path = f"{output}/{sanitize_filename(course.name)}/" \
                   f"{sanitize_filename(file.filename)}"
            # Check if this is a video file by extension
            if file.filename and file.filename.lower().endswith(
                    ('.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv')):
                print(f"Found video file: {file.filename}")
            file.download(path)
    except (Unauthorized, Forbidden) as e:
        # The file listing may be off-limits for this token; module content
        # has already been saved, so report it and carry on.
        print(f"Skipping remaining files of {course.name}: {e}")
if __name__ == "__main__":
    # Command-line entry point: parse arguments, connect to Canvas and scrape
    # either the requested courses or every course visible to the token.
    parser = argparse.ArgumentParser(description="Download all content from Canvas")
    parser.add_argument("url", help="URL to the Canvas website, e.g. https://canvas.utwente.nl")
    parser.add_argument("token", help="Token generated in the settings page on Canvas")
    parser.add_argument("output", help="Path to the output folder, e.g. output/")
    # argparse ignores `const` on positional arguments, so an omitted value
    # arrives as the default (None) and is handled below.
    parser.add_argument("courses", help="Comma-separated course IDs or 'all'", nargs="?", default=None)
    args = parser.parse_args()

    # `output` and `canvas` are module-level names read by get_course_files().
    output = args.output.rstrip("/") + "/"
    if args.courses is None:
        args.courses = "all"
        print("No courses specified. Scraping all courses.")
    canvas = Canvas(args.url, args.token)

    # Select courses to scrape, default to all
    if args.courses != "all":
        # Blank entries (e.g. from a trailing comma) are skipped rather than
        # crashing int().
        courses = [
            canvas.get_course(int(course_id))
            for course_id in args.courses.split(",")
            if course_id.strip()
        ]
    else:
        courses = canvas.get_courses()

    # Perform scrape
    for course in courses:
        course: Course = course
        get_course_files(course, args.url, args.token)