# kestrelsnest-blog/migrate_images.py

#!/usr/bin/env python3
"""
Find and migrate images from old WordPress site to Hugo
"""

import os
import re
import shutil
from pathlib import Path
from urllib.parse import unquote, urlparse

# Directories
# NOTE: these are absolute paths on one specific machine; adjust them before
# running the script anywhere else.
# Hugo posts (*.md) that are scanned for image URLs and rewritten in place.
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
# Directory searched for the image files referenced by those URLs.
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/ericwagoner.com')
# Hugo static root; images are copied to images/legacy/ beneath it.
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')

# Patterns for image links that point at the old site. They are compiled once
# at import time and match case-insensitively so that upper-case extensions
# such as ``.JPG`` are picked up as well.
_MD_IMAGE_RE = re.compile(
    # ![alt](url) or ![alt](url "title"); the URL is matched lazily so an
    # optional quoted title is not swallowed into it.
    r'!\[.*?\]\((http://www\.ericwagoner\.com/[^)]+?(?:jpg|jpeg|gif|png))'
    r'(?:\s+"[^"]*")?\)',
    re.IGNORECASE,
)
_HTML_IMAGE_RE = re.compile(
    r'src="(http://www\.ericwagoner\.com/[^"]+(?:jpg|jpeg|gif|png))"',
    re.IGNORECASE,
)


def extract_image_urls(posts_dir=None):
    """Extract all old-site image URLs from posts.

    Scans every ``*.md`` file directly inside the posts directory for
    Markdown image links and HTML ``src="..."`` attributes that point at
    ``http://www.ericwagoner.com/`` and end in jpg, jpeg, gif or png
    (any letter case).

    Args:
        posts_dir: Directory containing the posts. Defaults to ``POSTS_DIR``.

    Returns:
        A sorted list of the unique URLs, exactly as written in the posts.
    """
    root = POSTS_DIR if posts_dir is None else Path(posts_dir)
    image_urls = set()

    for post in root.glob('*.md'):
        content = post.read_text(encoding='utf-8')
        image_urls.update(_MD_IMAGE_RE.findall(content))
        image_urls.update(_HTML_IMAGE_RE.findall(content))

    return sorted(image_urls)

def find_image_in_archive(url, archive_dir=None):
    """Find the image file for a URL in the archive.

    The URL's path is looked up relative to the archive directory. Each form
    of the path is tried as written and, if it contains ``/weblog/``, with
    the first ``/weblog/`` segment removed. The path is tried first exactly
    as it appears in the URL and then percent-decoded (``my%20photo.jpg`` ->
    ``my photo.jpg``), since files on disk normally carry decoded names.

    Args:
        url: Absolute image URL taken from a post.
        archive_dir: Root of the archive. Defaults to ``ARCHIVE_DIR``.

    Returns:
        The ``Path`` of the first candidate that is a regular file, or
        ``None`` when no candidate matches.
    """
    root = ARCHIVE_DIR if archive_dir is None else Path(archive_dir)
    raw_path = urlparse(url).path

    variants = [raw_path]
    decoded_path = unquote(raw_path)
    if decoded_path != raw_path:
        variants.append(decoded_path)

    for variant in variants:
        candidates = [variant]
        # Sometimes weblog is in the path but not in archive
        if '/weblog/' in variant:
            candidates.append(variant.replace('/weblog/', '/', 1))

        for candidate in candidates:
            full_path = root / candidate.lstrip('/')
            # is_file() rather than exists(): a directory (for example the
            # archive root when the URL has no path) is not an image.
            if full_path.is_file():
                return full_path

    return None

def copy_image_to_hugo(src_path, url, static_dir=None):
    """Copy image to Hugo static directory and return new path.

    The file is copied to ``<static>/images/legacy/<url path>``, creating
    parent directories as needed. The on-disk name is percent-decoded so that
    the returned URL (which keeps the original escapes) resolves to the file
    when it is served.

    Args:
        src_path: Path of the image file to copy.
        url: Original absolute URL of the image; only its path is used.
        static_dir: Hugo static root. Defaults to ``STATIC_DIR``.

    Returns:
        The site-relative URL ``/images/legacy/<url path>`` on success, or
        ``None`` if a directory could not be created or the copy failed (an
        error line is printed in that case).
    """
    static_root = STATIC_DIR if static_dir is None else Path(static_dir)

    relative_path = urlparse(url).path.lstrip('/')
    dest_path = static_root / 'images' / 'legacy' / unquote(relative_path)

    try:
        # mkdir is inside the try so a failure here is reported the same way
        # as a failed copy instead of aborting the whole migration.
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dest_path)
    except OSError as e:
        print(f"Error copying {src_path}: {e}")
        return None

    # Return the new URL path
    return f'/images/legacy/{relative_path}'

def update_posts_with_new_urls(url_mapping, posts_dir=None):
    """Update all posts with new image URLs.

    Every ``*.md`` file directly inside the posts directory has each old URL
    replaced by its new URL. Files are rewritten only when their content
    actually changes, and a line is printed for each rewritten file.

    Args:
        url_mapping: Dict mapping old absolute URLs to their new URLs.
        posts_dir: Directory containing the posts. Defaults to ``POSTS_DIR``.
    """
    posts_root = POSTS_DIR if posts_dir is None else Path(posts_dir)

    # Replace longer URLs first so a URL that is a prefix of another one
    # cannot rewrite part of the longer URL before it gets its own turn.
    replacements = sorted(
        url_mapping.items(),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    for post in posts_root.glob('*.md'):
        # newline='' disables newline translation in both directions, so the
        # file's existing line endings survive the rewrite untouched.
        with open(post, 'r', encoding='utf-8', newline='') as f:
            original = f.read()

        updated = original
        for old_url, new_url in replacements:
            # Plain string replacement: both sides are literal text, and a
            # regex substitution would interpret backslashes in new_url.
            updated = updated.replace(old_url, new_url)

        if updated != original:
            with open(post, 'w', encoding='utf-8', newline='') as f:
                f.write(updated)
            print(f"Updated {post.name}")

def _report_urls(heading, urls, limit=5):
    """Print a heading followed by at most ``limit`` URLs and an overflow note."""
    print(heading)
    for url in urls[:limit]:
        print(f"  - {url}")
    if len(urls) > limit:
        print(f"  ... and {len(urls) - limit} more")


def main():
    """Main migration function.

    Extracts image URLs from the posts, copies every image that can be found
    in the archive into the Hugo static directory, rewrites the posts to use
    the new URLs, and prints a summary.

    Returns:
        A tuple ``(found_count, missing_count)``: the number of images
        copied and the number of URLs with no matching file in the archive.
        Images that were found but could not be copied are reported on
        stdout but are not part of either count.
    """
    print("Extracting image URLs from posts...")
    image_urls = extract_image_urls()
    print(f"Found {len(image_urls)} unique image URLs")

    url_mapping = {}
    not_found = []
    copy_failed = []

    print("\nSearching for images in archive...")
    for url in image_urls:
        src_path = find_image_in_archive(url)
        if src_path is None:
            not_found.append(url)
            continue

        new_url = copy_image_to_hugo(src_path, url)
        if new_url is None:
            # Found but not copied: track it so the summary accounts for
            # every URL instead of silently dropping these.
            copy_failed.append(url)
            continue

        url_mapping[url] = new_url
        if len(url_mapping) % 10 == 0:
            print(f"Copied {len(url_mapping)} images...")

    found_count = len(url_mapping)
    print(f"\n✅ Successfully copied {found_count} images")

    if not_found:
        _report_urls(f"⚠️  Could not find {len(not_found)} images:", not_found)

    if copy_failed:
        _report_urls(f"⚠️  Failed to copy {len(copy_failed)} images:", copy_failed)

    print("\nUpdating posts with new image URLs...")
    update_posts_with_new_urls(url_mapping)

    print(f"\n✅ Migration complete: {found_count} images migrated")

    return found_count, len(not_found)

# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    # main() returns (copied count, not-found count); the values are bound
    # here but not used any further in this file.
    found, missing = main()