#!/usr/bin/env python3
"""
Find and migrate all remaining ericwagoner.com links - images, pages,
and internal blog links.

Workflow (see ``main``):

1. Scan every ``*.md`` post for markdown / HTML links to the legacy site.
2. Sort the URLs into images, blog posts, standalone HTML pages and "other".
3. Copy images and HTML pages from the local site archive into the blog's
   static directory.
4. Rewrite the links inside the posts so they point at the migrated copies
   (or at the matching internal post).
"""

import os
import re
import shutil
from collections import defaultdict
from pathlib import Path
from urllib.parse import unquote

POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/ericwagoner.com')
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')

# Scheme + host of the legacy site; http/https and the "www." prefix are all
# accepted so that no variant of a link is silently skipped.
_ORIGIN = r'https?://(?:www\.)?ericwagoner\.com'
_ORIGIN_RE = re.compile(_ORIGIN, re.IGNORECASE)

# Every pattern captures the URL in group 1.
_LINK_PATTERNS = (
    # Markdown links and images; an optional title after the URL
    # ([text](url "title")) is skipped rather than captured.
    re.compile(r'\[[^\]]*\]\((' + _ORIGIN + r'[^)\s]+)[^)]*\)', re.IGNORECASE),
    # HTML href / src attributes.
    re.compile(r'(?:href|src)="(' + _ORIGIN + r'[^"]+)"', re.IGNORECASE),
)

_POST_ID_PATTERNS = (
    re.compile(r'/archives/(\d+)\.html'),          # /archives/001234.html
    re.compile(r'/\?p=(\d+)'),                     # /?p=1234
    re.compile(r'/archives/.*\.php\?id=(\d+)'),    # /archives/something.php?id=1234
)

# Trailing "-<digits>" of a post filename stem, e.g. "my-title-001234".
_POST_FILE_ID_RE = re.compile(r'-(\d+)$')

_IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.gif', '.png')
_HTML_SUFFIXES = ('.html', '.htm')


def _site_relative(url):
    """Return *url* with the legacy scheme + host removed (path, query, fragment)."""
    match = _ORIGIN_RE.match(url)
    return url[match.end():] if match else url


def _url_path(url):
    """Return only the (still percent-encoded) path of *url*, without query/fragment."""
    return re.split(r'[?#]', _site_relative(url), maxsplit=1)[0]


def _escapes_root(relative_path):
    """Return True if *relative_path* contains a ``..`` segment."""
    return '..' in Path(relative_path).parts


def _archive_candidates(url):
    """List the archive-relative paths under which *url*'s file may be stored.

    The percent-decoded path is tried first, then the raw one; each is also
    tried with the first ``/weblog/`` segment dropped.
    """
    raw = _url_path(url)
    candidates = []
    for path in (unquote(raw), raw):
        for variant in (path, path.replace('/weblog/', '/', 1)):
            relative = variant.lstrip('/')
            if relative and relative not in candidates and not _escapes_root(relative):
                candidates.append(relative)
    return candidates


def find_all_external_links(posts_dir=None):
    """Find all links to ericwagoner.com.

    Args:
        posts_dir: Directory holding the ``*.md`` posts; defaults to POSTS_DIR.

    Returns:
        A ``defaultdict(list)`` mapping each URL to the names of the posts
        that reference it. Each post name is listed once per URL.
    """
    posts_dir = POSTS_DIR if posts_dir is None else Path(posts_dir)
    all_links = defaultdict(list)

    for post in sorted(posts_dir.glob('*.md')):
        content = post.read_text(encoding='utf-8')

        # A URL may be matched by several patterns (or several times);
        # record the post only once for it.
        seen = set()
        for pattern in _LINK_PATTERNS:
            for match in pattern.finditer(content):
                url = match.group(1)
                if url not in seen:
                    seen.add(url)
                    all_links[url].append(post.name)

    return all_links


def categorize_links(links):
    """Categorize links by type.

    The decision is made on the URL's path (query string and fragment are
    ignored, comparison is case-insensitive).

    Args:
        links: Mapping of URL -> list of post names.

    Returns:
        Dict with the keys ``images``, ``blog_posts``, ``html_pages`` and
        ``other``; each value maps URL -> the value it had in *links*.
    """
    categories = {
        'images': {},
        'blog_posts': {},
        'html_pages': {},
        'other': {},
    }

    for url, posts in links.items():
        path = _url_path(url).lower()
        if path.endswith(_IMAGE_SUFFIXES):
            kind = 'images'
        elif '/weblog/archives/' in path and path.endswith('.html'):
            kind = 'blog_posts'
        elif path.endswith(_HTML_SUFFIXES):
            kind = 'html_pages'
        elif find_wordpress_post_id(url):
            # e.g. /?p=1234 or /archives/x.php?id=1234
            kind = 'blog_posts'
        else:
            kind = 'other'
        categories[kind][url] = posts

    return categories


def _migrate_files(urls, dest_root, label, archive_dir):
    """Copy the archived file behind each URL below *dest_root*.

    Files whose destination already exists are skipped silently.

    Returns:
        ``(migrated, not_found)``: number of files copied and the list of
        URLs for which no archived file was found.
    """
    migrated = 0
    not_found = []

    for url in urls:
        # The destination uses the decoded name, which is what a web server
        # looks up when it receives the percent-encoded link.
        dest_relative = unquote(_url_path(url)).lstrip('/')
        if _escapes_root(dest_relative):
            not_found.append(url)
            continue

        dest_path = dest_root / dest_relative
        if dest_path.exists():
            continue  # already migrated

        src_path = None
        for candidate in _archive_candidates(url):
            if (archive_dir / candidate).is_file():
                src_path = archive_dir / candidate
                break
        if src_path is None:
            not_found.append(url)
            continue

        try:
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src_path, dest_path)
        except OSError as e:
            print(f" Error copying {src_path}: {e}")
        else:
            migrated += 1
            print(f" Copied {label}: {_site_relative(url)}")

    return migrated, not_found


def migrate_remaining_images(images, archive_dir=None, static_dir=None):
    """Copy remaining images to ``<static>/images/legacy/``.

    Args:
        images: Iterable of image URLs.
        archive_dir: Root of the archived site; defaults to ARCHIVE_DIR.
        static_dir: Blog static directory; defaults to STATIC_DIR.

    Returns:
        ``(migrated, not_found)``: count of copied files and list of URLs
        whose file is missing from the archive.
    """
    archive_dir = ARCHIVE_DIR if archive_dir is None else Path(archive_dir)
    static_dir = STATIC_DIR if static_dir is None else Path(static_dir)
    return _migrate_files(images, static_dir / 'images' / 'legacy', 'image', archive_dir)


def find_wordpress_post_id(url):
    """Try to find WordPress post ID from URL.

    Returns:
        The numeric ID as a string without leading zeros, or ``None`` when
        the URL matches no known pattern or the ID consists only of zeros.
    """
    for pattern in _POST_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1).lstrip('0') or None
    return None


def migrate_html_pages(pages, archive_dir=None, static_dir=None):
    """Copy standalone HTML pages to ``<static>/legacy/``.

    Args:
        pages: Iterable of HTML page URLs.
        archive_dir: Root of the archived site; defaults to ARCHIVE_DIR.
        static_dir: Blog static directory; defaults to STATIC_DIR.

    Returns:
        ``(migrated, not_found)``: count of copied files and list of URLs
        whose file is missing from the archive.
    """
    archive_dir = ARCHIVE_DIR if archive_dir is None else Path(archive_dir)
    static_dir = STATIC_DIR if static_dir is None else Path(static_dir)
    return _migrate_files(pages, static_dir / 'legacy', 'HTML page', archive_dir)


def _index_posts_by_id(posts_dir):
    """Map post ID (no leading zeros) -> filename stem of the first matching post."""
    index = {}
    for post in sorted(posts_dir.glob('*.md')):
        match = _POST_FILE_ID_RE.search(post.stem)
        if not match:
            continue
        post_id = match.group(1).lstrip('0')
        if post_id:
            index.setdefault(post_id, post.stem)
    return index


def _build_replacements(categorized, posts_dir):
    """Return a dict of legacy URL -> new site-relative link."""
    replacements = {}

    for url in categorized['images']:
        replacements[url] = '/images/legacy' + _site_relative(url)

    for url in categorized['html_pages']:
        replacements[url] = '/legacy' + _site_relative(url)

    # Internal blog post links: only rewritten when a post file with the
    # same ID exists.
    if categorized['blog_posts']:
        posts_by_id = _index_posts_by_id(posts_dir)
        for url in categorized['blog_posts']:
            post_id = find_wordpress_post_id(url)
            post_name = posts_by_id.get(post_id) if post_id else None
            if post_name:
                new_url = f'/posts/{post_name}/'
                replacements[url] = new_url
                print(f" Linked to internal post: {url} -> {new_url}")

    return replacements


def update_links_in_posts(categorized, posts_dir=None):
    """Update all links in posts.

    Image and HTML page links are rewritten whether or not the file was
    actually copied; blog post links only when a matching post exists.

    Args:
        categorized: Result of ``categorize_links``.
        posts_dir: Directory holding the ``*.md`` posts; defaults to POSTS_DIR.

    Returns:
        Number of post files whose content changed.
    """
    posts_dir = POSTS_DIR if posts_dir is None else Path(posts_dir)

    # Longest URLs first, so a URL that is a prefix of another one cannot
    # clobber the longer link before it gets its own replacement.
    replacements = sorted(
        _build_replacements(categorized, posts_dir).items(),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    updates = 0
    for post in sorted(posts_dir.glob('*.md')):
        original_content = post.read_text(encoding='utf-8')

        content = original_content
        for url, new_url in replacements:
            content = content.replace(url, new_url)

        if content != original_content:
            post.write_text(content, encoding='utf-8')
            updates += 1

    return updates


def main():
    """Run the full migration and return the number of unique URLs found."""
    print("Finding all external links to ericwagoner.com...")
    all_links = find_all_external_links()
    print(f"Found {len(all_links)} unique URLs")

    print("\nCategorizing links...")
    categorized = categorize_links(all_links)
    print(f" Images: {len(categorized['images'])}")
    print(f" Blog posts: {len(categorized['blog_posts'])}")
    print(f" HTML pages: {len(categorized['html_pages'])}")
    print(f" Other: {len(categorized['other'])}")

    print("\n=== Migrating remaining images ===")
    img_migrated, img_not_found = migrate_remaining_images(categorized['images'])
    print(f"✅ Migrated {img_migrated} images")
    if img_not_found:
        print(f"⚠️ Could not find {len(img_not_found)} images")

    print("\n=== Migrating HTML pages ===")
    html_migrated, html_not_found = migrate_html_pages(categorized['html_pages'])
    print(f"✅ Migrated {html_migrated} HTML pages")
    if html_not_found:
        print(f"⚠️ Could not find {len(html_not_found)} HTML pages")

    print("\n=== Updating links in posts ===")
    updated = update_links_in_posts(categorized)
    print(f"✅ Updated {updated} posts")

    # Show sample of "other" links for review
    other_urls = list(categorized['other'])
    if other_urls:
        print("\n=== Other links that may need attention ===")
        for url in other_urls[:10]:
            print(f" {url}")
        if len(other_urls) > 10:
            print(f" ... and {len(categorized['other']) - 10} more")

    print("\n✅ Migration complete!")

    return len(all_links)


if __name__ == "__main__":
    main()