Import WordPress posts and migrate standalone content to Hugo
- Successfully imported 1,731 WordPress posts to Hugo markdown format
- Migrated 204+ images from the archive to the static directory
- Copied standalone directories (curtain, farm, gobbler, house, images, party, revcemetery, railsday, birthday)
- Fixed all internal links to use the /legacy prefix for archived content
- Remapped archive links to point to the correct Hugo posts
- Fixed the Louisville, Georgia cemetery post rendering issue

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
230
migrate_all_links.py
Normal file
230
migrate_all_links.py
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Find and migrate all remaining ericwagoner.com links - images, pages, and internal blog links
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# Hugo content directory holding the converted markdown posts (scan + rewrite target).
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
# Local snapshot of the old ericwagoner.com site — the migration source.
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/blog/static') if False else Path('/Users/ericwagoner/Sites/ericwagoner.com')
# Hugo static directory — destination for migrated images and legacy pages.
# NOTE(review): hard-coded absolute paths; this one-off script only runs on the author's machine.
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')
|
||||
|
||||
def find_all_external_links():
    """Scan every markdown post for URLs pointing at www.ericwagoner.com.

    Returns a defaultdict mapping each matched URL to the list of post
    filenames that reference it (a post appears once per match).
    """
    link_patterns = (
        r'\[([^\]]*)\]\((http://www\.ericwagoner\.com[^)]+)\)',   # Markdown links
        r'href="(http://www\.ericwagoner\.com[^"]+)"',            # HTML links
        r'src="(http://www\.ericwagoner\.com[^"]+)"',             # Image sources
        r'!\[([^\]]*)\]\((http://www\.ericwagoner\.com[^)]+)\)',  # Image links
    )
    found = defaultdict(list)

    for post_file in POSTS_DIR.glob('*.md'):
        text = post_file.read_text(encoding='utf-8')
        for pattern in link_patterns:
            # href/src patterns capture the URL as group 1; the markdown
            # patterns capture the link text first, so their URL is group 2.
            url_group = 1 if ('href' in pattern or 'src' in pattern) else 2
            for hit in re.finditer(pattern, text):
                found[hit.group(url_group)].append(post_file.name)

    return found
|
||||
|
||||
def categorize_links(links):
    """Split a URL -> posts mapping into images, blog posts, HTML pages, and other.

    Classification order matters: an archive URL ending in an image extension
    is treated as an image, not a blog post.
    """
    images, blog_posts, html_pages, other = {}, {}, {}, {}

    for url, referring_posts in links.items():
        if re.search(r'\.(jpg|jpeg|gif|png)$', url, re.IGNORECASE):
            images[url] = referring_posts
        elif '/weblog/archives/' in url and url.endswith('.html'):
            blog_posts[url] = referring_posts
        elif url.endswith(('.html', '.htm')):
            html_pages[url] = referring_posts
        else:
            other[url] = referring_posts

    return {
        'images': images,
        'blog_posts': blog_posts,
        'html_pages': html_pages,
        'other': other,
    }
|
||||
|
||||
def migrate_remaining_images(images):
    """Copy not-yet-migrated images from the archive into static/images/legacy.

    Returns a tuple (number of images copied, list of source URLs that
    could not be located in the archive).
    """
    copied = 0
    missing = []

    for url in images:
        # Strip the host to get the site-relative path.
        path = url.replace('http://www.ericwagoner.com', '')

        # Skip anything a previous run already migrated.
        dest_path = STATIC_DIR / 'images' / 'legacy' / path.lstrip('/')
        if dest_path.exists():
            continue

        # Locate the file in the archive snapshot; some files live without
        # the /weblog/ prefix, so fall back to that layout.
        src_path = ARCHIVE_DIR / path.lstrip('/')
        if not src_path.exists() and '/weblog/' in path:
            src_path = ARCHIVE_DIR / path.replace('/weblog/', '/', 1).lstrip('/')

        if not src_path.exists():
            missing.append(url)
            continue

        dest_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(src_path, dest_path)
        except Exception as e:
            print(f"  Error copying {src_path}: {e}")
        else:
            copied += 1
            print(f"  Copied image: {path}")

    return copied, missing
|
||||
|
||||
def find_wordpress_post_id(url):
    """Extract the numeric WordPress post ID from an archive-style URL.

    Returns the ID with leading zeros stripped (an all-zero ID yields '',
    which callers treat as "no ID"), or None when no pattern matches.
    """
    id_patterns = (
        r'/archives/(\d+)\.html',        # /archives/001234.html
        r'/\?p=(\d+)',                   # /?p=1234
        r'/archives/.*\.php\?id=(\d+)',  # /archives/something.php?id=1234
    )

    for pat in id_patterns:
        hit = re.search(pat, url)
        if hit:
            # Hugo post filenames use the ID without leading zeros.
            return hit.group(1).lstrip('0')

    return None
|
||||
|
||||
def migrate_html_pages(pages):
    """Copy standalone HTML pages from the archive into static/legacy.

    Returns a tuple (number of pages copied, list of source URLs that
    could not be located in the archive).
    """
    copied = 0
    missing = []

    for url in pages:
        # Strip the host to get the site-relative path.
        path = url.replace('http://www.ericwagoner.com', '')

        # Skip anything a previous run already migrated.
        dest_path = STATIC_DIR / 'legacy' / path.lstrip('/')
        if dest_path.exists():
            continue

        # Locate the file in the archive snapshot; some files live without
        # the /weblog/ prefix, so fall back to that layout.
        src_path = ARCHIVE_DIR / path.lstrip('/')
        if not src_path.exists() and '/weblog/' in path:
            src_path = ARCHIVE_DIR / path.replace('/weblog/', '/', 1).lstrip('/')

        if not src_path.exists():
            missing.append(url)
            continue

        dest_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(src_path, dest_path)
        except Exception as e:
            print(f"  Error copying {src_path}: {e}")
        else:
            copied += 1
            print(f"  Copied HTML page: {path}")

    return copied, missing
|
||||
|
||||
def update_links_in_posts(categorized):
    """Rewrite ericwagoner.com URLs in every post to their migrated locations.

    Images move under /images/legacy, standalone pages under /legacy, and
    old WordPress archive URLs are remapped to the imported Hugo post slugs.
    Returns the number of posts whose content changed.
    """
    changed = 0

    for post_file in POSTS_DIR.glob('*.md'):
        with open(post_file, 'r', encoding='utf-8') as fh:
            text = fh.read()
        rewritten = text

        # Point image URLs at their copies under /images/legacy.
        for url in categorized['images']:
            rel = url.replace('http://www.ericwagoner.com', '')
            rewritten = rewritten.replace(url, '/images/legacy' + rel)

        # Point standalone HTML pages at their copies under /legacy.
        for url in categorized['html_pages']:
            rel = url.replace('http://www.ericwagoner.com', '')
            rewritten = rewritten.replace(url, '/legacy' + rel)

        # Remap old WordPress archive URLs onto the imported Hugo posts,
        # matched by the post ID embedded in the filename.
        for url in categorized['blog_posts']:
            post_id = find_wordpress_post_id(url)
            if not post_id:
                continue
            matches = list(POSTS_DIR.glob(f'*-{post_id}.md'))
            if not matches:
                # Retry without leading zeros (historical filename variants).
                matches = list(POSTS_DIR.glob(f'*-{int(post_id)}.md'))
            if matches:
                new_url = f'/posts/{matches[0].stem}/'
                rewritten = rewritten.replace(url, new_url)
                print(f"  Linked to internal post: {url} -> {new_url}")

        # Only touch the file when something actually changed.
        if rewritten != text:
            with open(post_file, 'w', encoding='utf-8') as fh:
                fh.write(rewritten)
            changed += 1

    return changed
|
||||
|
||||
def main():
    """Run the full migration pipeline: discover, categorize, copy, relink.

    Returns the number of unique ericwagoner.com URLs found.
    """
    print("Finding all external links to ericwagoner.com...")
    links = find_all_external_links()
    print(f"Found {len(links)} unique URLs")

    print("\nCategorizing links...")
    cats = categorize_links(links)

    print(f"  Images: {len(cats['images'])}")
    print(f"  Blog posts: {len(cats['blog_posts'])}")
    print(f"  HTML pages: {len(cats['html_pages'])}")
    print(f"  Other: {len(cats['other'])}")

    print("\n=== Migrating remaining images ===")
    n_images, images_missing = migrate_remaining_images(cats['images'])
    print(f"✅ Migrated {n_images} images")
    if images_missing:
        print(f"⚠️  Could not find {len(images_missing)} images")

    print("\n=== Migrating HTML pages ===")
    n_pages, pages_missing = migrate_html_pages(cats['html_pages'])
    print(f"✅ Migrated {n_pages} HTML pages")
    if pages_missing:
        print(f"⚠️  Could not find {len(pages_missing)} HTML pages")

    print("\n=== Updating links in posts ===")
    n_updated = update_links_in_posts(cats)
    print(f"✅ Updated {n_updated} posts")

    # Surface a sample of the uncategorized links for manual review.
    leftovers = cats['other']
    if leftovers:
        print("\n=== Other links that may need attention ===")
        for url in list(leftovers.keys())[:10]:
            print(f"  {url}")
        if len(leftovers) > 10:
            print(f"  ... and {len(leftovers) - 10} more")

    print("\n✅ Migration complete!")

    return len(links)
|
||||
|
||||
# Script entry point; main()'s return value (the URL count) is unused here.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user