Files
kestrelsnest-blog/migrate_all_links.py
Eric Wagoner eddd9d2a80 Import WordPress posts and migrate standalone content to Hugo
- Successfully imported 1731 WordPress posts to Hugo markdown format
- Migrated 204+ images from archive to static directory
- Copied standalone directories (curtain, farm, gobbler, house, images, party, revcemetery, railsday, birthday)
- Fixed all internal links to use /legacy prefix for archived content
- Remapped archive links to point to correct Hugo posts
- Fixed Louisville Georgia Cemetery post rendering issue

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-23 16:23:40 -04:00

230 lines
7.7 KiB
Python

#!/usr/bin/env python3
"""
Find and migrate all remaining ericwagoner.com links - images, pages, and internal blog links
"""
import os
import re
import shutil
from pathlib import Path
from collections import defaultdict
# Hugo posts converted from WordPress — the markdown files scanned and rewritten.
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
# Local mirror of the old ericwagoner.com site — source of archived files.
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/ericwagoner.com')
# Hugo static asset root — destination for migrated images and pages.
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')
def find_all_external_links(posts_dir=None):
    """Scan post markdown files for links pointing at ericwagoner.com.

    Args:
        posts_dir: Directory containing ``*.md`` posts to scan; defaults to
            the module-level ``POSTS_DIR`` when None.

    Returns:
        dict mapping each ``http://www.ericwagoner.com...`` URL to the list
        of post filenames that reference it (each post listed at most once
        per URL).
    """
    posts_dir = POSTS_DIR if posts_dir is None else Path(posts_dir)
    all_links = defaultdict(list)
    # (pattern, group-holding-the-URL).  NOTE: the markdown-link pattern also
    # matches the `[alt](url)` tail of an image link `![alt](url)`, which is
    # why the original separate image pattern double-counted every image URL;
    # one pattern covers both, and per-post dedup below guards against any
    # remaining overlap between markdown and HTML matches.
    patterns = [
        (r'\[([^\]]*)\]\((http://www\.ericwagoner\.com[^)]+)\)', 2),  # Markdown links & images
        (r'href="(http://www\.ericwagoner\.com[^"]+)"', 1),           # HTML links
        (r'src="(http://www\.ericwagoner\.com[^"]+)"', 1),            # Image sources
    ]
    for post in posts_dir.glob('*.md'):
        content = post.read_text(encoding='utf-8')
        found = set()
        for pattern, group in patterns:
            for match in re.finditer(pattern, content):
                found.add(match.group(group))
        # Sorted for a deterministic result regardless of match order.
        for url in sorted(found):
            all_links[url].append(post.name)
    return all_links
def categorize_links(links):
    """Split a url -> referencing-posts mapping into content categories.

    Returns a dict with keys 'images', 'blog_posts', 'html_pages', and
    'other', each holding the subset of *links* that matched that bucket.
    """
    image_re = re.compile(r'\.(jpg|jpeg|gif|png)$', re.IGNORECASE)
    buckets = {'images': {}, 'blog_posts': {}, 'html_pages': {}, 'other': {}}
    for url, posts in links.items():
        if image_re.search(url):
            bucket = 'images'
        elif '/weblog/archives/' in url and url.endswith('.html'):
            # Old WordPress archive permalink — candidate for remapping.
            bucket = 'blog_posts'
        elif url.endswith(('.html', '.htm')):
            bucket = 'html_pages'
        else:
            bucket = 'other'
        buckets[bucket][url] = posts
    return buckets
def migrate_remaining_images(images):
    """Copy archived images referenced by posts into the static tree.

    Looks each URL up under ARCHIVE_DIR (with a /weblog/-stripped fallback)
    and copies it beneath STATIC_DIR/images/legacy, skipping files already
    migrated on a previous run.

    Returns:
        (copied_count, missing_urls) tuple.
    """
    copied = 0
    missing = []
    for url in images:
        # URL path relative to the old site root.
        path = url.replace('http://www.ericwagoner.com', '')
        dest = STATIC_DIR / 'images' / 'legacy' / path.lstrip('/')
        if dest.exists():
            continue  # already migrated
        src = ARCHIVE_DIR / path.lstrip('/')
        if '/weblog/' in path and not src.exists():
            # Some archive files live outside the weblog/ subtree.
            src = ARCHIVE_DIR / path.replace('/weblog/', '/', 1).lstrip('/')
        if not src.exists():
            missing.append(url)
            continue
        dest.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(src, dest)
            copied += 1
            print(f" Copied image: {path}")
        except Exception as e:
            print(f" Error copying {src}: {e}")
    return copied, missing
def find_wordpress_post_id(url):
    """Extract the numeric WordPress post id from an archive URL.

    Handles the historical permalink shapes; returns the id with leading
    zeros removed, or None when the URL carries no recognizable id.
    """
    id_patterns = (
        r'/archives/(\d+)\.html',          # /archives/001234.html
        r'/\?p=(\d+)',                     # /?p=1234
        r'/archives/.*\.php\?id=(\d+)',    # /archives/something.php?id=1234
    )
    for candidate in id_patterns:
        found = re.search(candidate, url)
        if found is not None:
            # Hugo filenames use the un-padded id, so drop leading zeros.
            return found.group(1).lstrip('0')
    return None
def migrate_html_pages(pages):
    """Copy standalone archived HTML pages beneath STATIC_DIR/legacy.

    Mirrors migrate_remaining_images: resolve each URL against ARCHIVE_DIR
    (with a /weblog/-stripped fallback), skip already-copied destinations.

    Returns:
        (copied_count, missing_urls) tuple.
    """
    copied = 0
    missing = []
    for url in pages:
        path = url.replace('http://www.ericwagoner.com', '')
        dest = STATIC_DIR / 'legacy' / path.lstrip('/')
        if dest.exists():
            continue  # already migrated
        src = ARCHIVE_DIR / path.lstrip('/')
        if '/weblog/' in path and not src.exists():
            # Fallback: the file may live outside the weblog/ subtree.
            src = ARCHIVE_DIR / path.replace('/weblog/', '/', 1).lstrip('/')
        if not src.exists():
            missing.append(url)
            continue
        dest.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(src, dest)
            copied += 1
            print(f" Copied HTML page: {path}")
        except Exception as e:
            print(f" Error copying {src}: {e}")
    return copied, missing
def update_links_in_posts(categorized):
    """Rewrite ericwagoner.com links inside every post to local paths.

    Image URLs move under /images/legacy, standalone pages under /legacy,
    and archive blog links are remapped onto the matching Hugo post when
    one can be located by its WordPress id in the filename.

    Returns:
        Number of post files actually modified.
    """
    base = 'http://www.ericwagoner.com'
    changed = 0
    for md_file in POSTS_DIR.glob('*.md'):
        with open(md_file, 'r', encoding='utf-8') as fh:
            text = fh.read()
        before = text
        # Migrated images now live under /images/legacy.
        for url in categorized['images']:
            text = text.replace(url, f"/images/legacy{url.replace(base, '')}")
        # Standalone pages now live under /legacy.
        for url in categorized['html_pages']:
            text = text.replace(url, f"/legacy{url.replace(base, '')}")
        # Archive permalinks -> internal Hugo post URLs, matched by id.
        for url in categorized['blog_posts']:
            wp_id = find_wordpress_post_id(url)
            if not wp_id:
                continue
            candidates = list(POSTS_DIR.glob(f'*-{wp_id}.md')) or \
                list(POSTS_DIR.glob(f'*-{int(wp_id)}.md'))
            if candidates:
                target = f'/posts/{candidates[0].stem}/'
                text = text.replace(url, target)
                print(f" Linked to internal post: {url} -> {target}")
        if text != before:
            with open(md_file, 'w', encoding='utf-8') as fh:
                fh.write(text)
            changed += 1
    return changed
def main():
    """Drive the migration: discover links, categorize, copy, relink.

    Returns the number of unique external URLs found.
    """
    print("Finding all external links to ericwagoner.com...")
    all_links = find_all_external_links()
    print(f"Found {len(all_links)} unique URLs")

    print("\nCategorizing links...")
    categorized = categorize_links(all_links)
    for label, key in (('Images', 'images'), ('Blog posts', 'blog_posts'),
                       ('HTML pages', 'html_pages'), ('Other', 'other')):
        print(f" {label}: {len(categorized[key])}")

    print("\n=== Migrating remaining images ===")
    copied_images, missing_images = migrate_remaining_images(categorized['images'])
    print(f"✅ Migrated {copied_images} images")
    if missing_images:
        print(f"⚠️ Could not find {len(missing_images)} images")

    print("\n=== Migrating HTML pages ===")
    copied_pages, missing_pages = migrate_html_pages(categorized['html_pages'])
    print(f"✅ Migrated {copied_pages} HTML pages")
    if missing_pages:
        print(f"⚠️ Could not find {len(missing_pages)} HTML pages")

    print("\n=== Updating links in posts ===")
    updated = update_links_in_posts(categorized)
    print(f"✅ Updated {updated} posts")

    # Surface a sample of uncategorized links for manual follow-up.
    leftovers = list(categorized['other'])
    if leftovers:
        print("\n=== Other links that may need attention ===")
        for url in leftovers[:10]:
            print(f" {url}")
        if len(leftovers) > 10:
            print(f" ... and {len(leftovers) - 10} more")

    print("\n✅ Migration complete!")
    return len(all_links)
# Run the full migration when executed as a script (no-op on import).
if __name__ == "__main__":
    main()