#!/usr/bin/env python3
"""
Find and migrate images from old WordPress site to Hugo
"""

import os
import re
import shutil
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import unquote, urlparse

# Directories
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/ericwagoner.com')
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')

StrPath = Union[str, os.PathLike]

# Image references pointing at the old site.  The extension must follow a
# literal dot and is matched case-insensitively (so ".JPG" is picked up);
# the host part stays an exact match.
_MD_IMAGE_RE = re.compile(
    r'!\[.*?\]\((http://www\.ericwagoner\.com/[^)]+\.(?i:jpg|jpeg|gif|png))\)'
)
_HTML_IMAGE_RE = re.compile(
    r'src="(http://www\.ericwagoner\.com/[^"]+\.(?i:jpg|jpeg|gif|png))"'
)


def _relative_fs_paths(url_path: str) -> List[str]:
    """Return candidate relative file paths for *url_path*.

    The raw path is tried first, then its percent-decoded form (so
    ``my%20pic.jpg`` also yields ``my pic.jpg``).  Candidates containing a
    ``..`` component are dropped so a lookup can never leave its root.
    """
    candidates: List[str] = []
    for variant in (url_path, unquote(url_path)):
        relative = variant.lstrip('/')
        if not relative or relative in candidates:
            continue
        if '..' in Path(relative).parts:
            continue
        candidates.append(relative)
    return candidates


def extract_image_urls(posts_dir: Optional[StrPath] = None) -> List[str]:
    """Extract all image URLs from posts.

    Args:
        posts_dir: Directory whose ``*.md`` files are scanned.  Defaults to
            ``POSTS_DIR``.

    Returns:
        Sorted list of unique old-site image URLs found in markdown image
        links and HTML ``src="..."`` attributes.
    """
    root = POSTS_DIR if posts_dir is None else Path(posts_dir)
    image_urls = set()

    for post in root.glob('*.md'):
        content = post.read_text(encoding='utf-8')
        # Markdown image links: ![alt](url)
        image_urls.update(_MD_IMAGE_RE.findall(content))
        # HTML img tags: src="url"
        image_urls.update(_HTML_IMAGE_RE.findall(content))

    return sorted(image_urls)


def find_image_in_archive(
    url: str, archive_dir: Optional[StrPath] = None
) -> Optional[Path]:
    """Find the image file in the archive.

    Args:
        url: Old-site image URL.
        archive_dir: Root of the archived site.  Defaults to ``ARCHIVE_DIR``.

    Returns:
        Path of the first matching regular file, or ``None`` if no candidate
        exists.
    """
    root = ARCHIVE_DIR if archive_dir is None else Path(archive_dir)
    url_path = urlparse(url).path

    # Try the direct path first; sometimes "weblog" is in the URL path but
    # not in the archive, so fall back to the path with it removed.
    url_paths = [url_path]
    if '/weblog/' in url_path:
        url_paths.append(url_path.replace('/weblog/', '/', 1))

    for candidate in url_paths:
        for relative in _relative_fs_paths(candidate):
            full_path = root / relative
            # is_file() rather than exists(): a directory is never an image.
            if full_path.is_file():
                return full_path

    return None


def copy_image_to_hugo(
    src_path: StrPath, url: str, static_dir: Optional[StrPath] = None
) -> Optional[str]:
    """Copy image to Hugo static directory and return new path.

    Args:
        src_path: Existing image file to copy.
        url: Old-site URL of the image; its path determines the destination
            below ``images/legacy``.
        static_dir: Hugo static directory.  Defaults to ``STATIC_DIR``.

    Returns:
        The new site-relative URL (``/images/legacy/...``), or ``None`` if
        the copy failed or the URL path is unsafe.
    """
    root = STATIC_DIR if static_dir is None else Path(static_dir)

    # The URL keeps its percent-encoding, but the file on disk must carry the
    # decoded name, because that is what a percent-encoded request decodes to.
    relative_url = urlparse(url).path.lstrip('/')
    relative_file = unquote(relative_url)

    if '..' in Path(relative_file).parts:
        print(f"Error copying {src_path}: unsafe path in {url}")
        return None

    dest_path = root / 'images' / 'legacy' / relative_file

    try:
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dest_path)
    except OSError as e:  # shutil.Error is a subclass of OSError
        print(f"Error copying {src_path}: {e}")
        return None

    return f'/images/legacy/{relative_url}'


def update_posts_with_new_urls(
    url_mapping: Dict[str, str], posts_dir: Optional[StrPath] = None
) -> None:
    """Update all posts with new image URLs.

    Args:
        url_mapping: Maps each old URL to its replacement.
        posts_dir: Directory whose ``*.md`` files are rewritten.  Defaults to
            ``POSTS_DIR``.

    Posts are rewritten only when their content actually changes.
    """
    root = POSTS_DIR if posts_dir is None else Path(posts_dir)

    for post in root.glob('*.md'):
        original = post.read_text(encoding='utf-8')
        content = original

        for old_url, new_url in url_mapping.items():
            if not old_url:
                continue  # an empty key would match between every character
            # Literal replacement: the new URL must not be interpreted as a
            # regex replacement template (backslashes, group references).
            content = content.replace(old_url, new_url)

        if content != original:
            post.write_text(content, encoding='utf-8')
            print(f"Updated {post.name}")


def main() -> Tuple[int, int]:
    """Main migration function.

    Returns:
        ``(found_count, not_found_count)``: the number of images copied and
        the number of URLs with no matching file in the archive.
    """
    print("Extracting image URLs from posts...")
    image_urls = extract_image_urls()
    print(f"Found {len(image_urls)} unique image URLs")

    url_mapping = {}
    found_count = 0
    not_found = []

    print("\nSearching for images in archive...")
    for url in image_urls:
        src_path = find_image_in_archive(url)
        if not src_path:
            not_found.append(url)
            continue

        new_url = copy_image_to_hugo(src_path, url)
        # A failed copy has already been reported by copy_image_to_hugo; it
        # is counted neither as found nor as not found.
        if new_url:
            url_mapping[url] = new_url
            found_count += 1
            if found_count % 10 == 0:
                print(f"Copied {found_count} images...")

    print(f"\n✅ Successfully copied {found_count} images")

    if not_found:
        print(f"⚠️ Could not find {len(not_found)} images:")
        for url in not_found[:5]:
            print(f" - {url}")
        if len(not_found) > 5:
            print(f" ... and {len(not_found) - 5} more")

    print("\nUpdating posts with new image URLs...")
    update_posts_with_new_urls(url_mapping)

    print(f"\n✅ Migration complete: {found_count} images migrated")

    return found_count, len(not_found)


if __name__ == "__main__":
    found, missing = main()