Import WordPress posts and migrate standalone content to Hugo

- Successfully imported 1731 WordPress posts to Hugo markdown format
- Migrated 204+ images from archive to static directory
- Copied standalone directories (curtain, farm, gobbler, house, images, party, revcemetery, railsday, birthday)
- Fixed all internal links to use /legacy prefix for archived content
- Remapped archive links to point to correct Hugo posts
- Fixed Louisville Georgia Cemetery post rendering issue

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-23 16:23:40 -04:00
parent c1b41472ac
commit eddd9d2a80
2423 changed files with 36062 additions and 3 deletions
--- a/migrate_images.py
+++ b/migrate_images.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""
+Find and migrate images from old WordPress site to Hugo
+"""
+
+import os
+import re
+import shutil
+from pathlib import Path
+from urllib.parse import urlparse
+
+# Directories (hard-coded absolute paths; adjust before running on another machine)
+POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')  # markdown posts that are scanned and rewritten in place
+ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/ericwagoner.com')  # local archive that is searched for the image files
+STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')  # image copies are written under images/legacy/ in here
+
+def extract_image_urls():
+    """Extract all ericwagoner.com image URLs referenced by the posts.
+
+    Scans every ``*.md`` file directly inside POSTS_DIR for markdown image links
+    and HTML ``src="..."`` attributes that end in .jpg/.jpeg/.gif/.png, and
+    returns the unique URLs as a sorted list of strings.
+    """
+    # Require a literal dot before the extension and ignore case, so files
+    # named like PHOTO.JPG are picked up instead of being silently skipped.
+    markdown_image = re.compile(r'!\[.*?\]\((http://www\.ericwagoner\.com/[^)]+\.(?:jpg|jpeg|gif|png))\)', re.IGNORECASE)
+    html_image = re.compile(r'src="(http://www\.ericwagoner\.com/[^"]+\.(?:jpg|jpeg|gif|png))"', re.IGNORECASE)
+    image_urls = set()
+    for post in POSTS_DIR.glob('*.md'):
+        content = post.read_text(encoding='utf-8')
+        image_urls.update(markdown_image.findall(content))
+        image_urls.update(html_image.findall(content))
+    return sorted(image_urls)
+
+def find_image_in_archive(url):
+    """Locate the archived file for an image URL.
+
+    The URL's path is resolved relative to ARCHIVE_DIR. When that file does not
+    exist and the path contains ``/weblog/``, the lookup is retried with the
+    first ``/weblog/`` collapsed to ``/``. Returns the existing Path, or None.
+    """
+    url_path = urlparse(url).path
+    # Candidate locations, in the order they should be tried
+    candidates = [url_path]
+    if '/weblog/' in url_path:
+        candidates.append(url_path.replace('/weblog/', '/', 1))
+
+    for candidate in candidates:
+        archived = ARCHIVE_DIR / candidate.lstrip('/')
+        if archived.exists():
+            return archived
+    return None
+
+def copy_image_to_hugo(src_path, url):
+    """Copy an archived image into the Hugo static tree.
+
+    The destination mirrors the URL's path under STATIC_DIR/images/legacy.
+    Returns the site-relative URL of the copy (``/images/legacy/<path>``), or
+    None after printing the error if the directory or file cannot be written.
+    """
+    relative_path = urlparse(url).path.lstrip('/')
+    dest_path = STATIC_DIR / 'images' / 'legacy' / relative_path
+
+    try:
+        # Directory creation sits inside the try so a filesystem error here is
+        # reported like a copy failure instead of propagating to the caller.
+        dest_path.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(src_path, dest_path)
+    except OSError as e:
+        print(f"Error copying {src_path}: {e}")
+        return None
+
+    return f'/images/legacy/{relative_path}'
+
+def update_posts_with_new_urls(url_mapping):
+    """Rewrite old image URLs in every post to their new locations.
+
+    url_mapping maps each old absolute URL to its replacement. Every ``*.md``
+    file directly inside POSTS_DIR is read, and only files whose text actually
+    changes are written back (their names are printed).
+    """
+    for post in POSTS_DIR.glob('*.md'):
+        original = post.read_text(encoding='utf-8')
+        # Plain str.replace: both sides are literal text, whereas re.sub would
+        # interpret backslashes or group references in the replacement URL.
+        content = original
+        for old_url, new_url in url_mapping.items():
+            content = content.replace(old_url, new_url)
+
+        if content != original:
+            post.write_text(content, encoding='utf-8')
+            print(f"Updated {post.name}")
+
+def main():
+    """Run the migration end to end, printing progress along the way.
+
+    Returns a ``(copied, missing)`` tuple: the number of images copied into the
+    static directory and the number of image URLs not found in the archive.
+    """
+    print("Extracting image URLs from posts...")
+    image_urls = extract_image_urls()
+    print(f"Found {len(image_urls)} unique image URLs")
+
+    url_mapping, not_found, found_count = {}, [], 0
+    print("\nSearching for images in archive...")
+    for url in image_urls:
+        src_path = find_image_in_archive(url)
+        if not src_path:
+            not_found.append(url)
+            continue
+        new_url = copy_image_to_hugo(src_path, url)
+        if not new_url:
+            continue  # copy failed; the URL is left out of the mapping
+        url_mapping[url] = new_url
+        found_count += 1
+        if found_count % 10 == 0:
+            print(f"Copied {found_count} images...")
+
+    print(f"\n✅ Successfully copied {found_count} images")
+    if not_found:
+        print(f"⚠️  Could not find {len(not_found)} images:")
+        for url in not_found[:5]:
+            print(f"  - {url}")
+        if len(not_found) > 5:
+            print(f"  ... and {len(not_found) - 5} more")
+
+    print("\nUpdating posts with new image URLs...")
+    update_posts_with_new_urls(url_mapping)
+    print(f"\n✅ Migration complete: {found_count} images migrated")
+
+    return found_count, len(not_found)
+
+if __name__ == "__main__":
+    found, missing = main()  # main() returns (copied, missing) counts; they are not used further here