kestrelsnest-blog/remap_archive_links.py

#!/usr/bin/env python3
"""
Remap old WordPress archive links to new Hugo post URLs
"""

import re
from pathlib import Path
from datetime import datetime

# Directory whose *.md post files this script scans and rewrites in place.
# NOTE: absolute, machine-specific path -- adjust before running elsewhere.
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')

def build_post_mapping(posts_dir=None):
    """Build a mapping from old weblog archive paths to new Hugo post URLs.

    Every ``*.md`` file in the posts directory is inspected and up to three
    kinds of keys are produced for it, each pointing at ``/posts/<stem>/``:

    * ``/weblog/archives/<id>.html`` plus its six-digit zero-padded twin,
      when the filename ends in ``-<digits>.md``;
    * ``/weblog/archive/YYYY_MM_DD_archive.html`` for the date that prefixes
      the filename;
    * the same pattern for the Sunday that starts the week containing the
      post (using the front-matter ``date:`` when it parses, otherwise the
      filename date).

    Files are visited in sorted filename order and date-based keys are only
    set once, so when several posts share an archive page the earliest
    filename wins regardless of directory listing order.

    Args:
        posts_dir: Directory to scan. Defaults to the module-level
            ``POSTS_DIR``.

    Returns:
        dict mapping old site-relative paths (str) to new Hugo URLs (str).
    """
    posts_dir = POSTS_DIR if posts_dir is None else Path(posts_dir)
    mapping = {}

    def weekly_archive_path(date_text):
        """Return the archive path of the Sunday on/before *date_text*, or None."""
        try:
            parsed = datetime.strptime(date_text, '%Y-%m-%d')
            # isoweekday() is Monday=1 .. Sunday=7, so "% 7" is the number
            # of days elapsed since the most recent Sunday.
            sunday = datetime.fromordinal(
                parsed.toordinal() - parsed.isoweekday() % 7
            )
        except ValueError:
            # Not a real calendar date (e.g. month 13): no weekly key.
            return None
        return f"/weblog/archive/{sunday.strftime('%Y_%m_%d')}_archive.html"

    for post in sorted(posts_dir.glob('*.md')):
        hugo_url = f'/posts/{post.stem}/'

        # Post-ID archives, e.g. "2003-11-18-first-image-1234.md" -> 1234.
        id_match = re.search(r'-(\d+)\.md$', post.name)
        if id_match:
            post_id = id_match.group(1)
            # Old links appear both zero-padded (000123) and bare (123).
            mapping[f'/weblog/archives/{post_id.zfill(6)}.html'] = hugo_url
            mapping[f'/weblog/archives/{post_id}.html'] = hugo_url

        # Date archives need the "YYYY-MM-DD" filename prefix.
        name_date = re.match(r'(\d{4})-(\d{2})-(\d{2})', post.name)
        if not name_date:
            continue
        year, month, day = name_date.groups()
        mapping.setdefault(
            f'/weblog/archive/{year}_{month}_{day}_archive.html', hugo_url
        )

        # Prefer the front-matter date for the weekly key. Only the calendar
        # date matters, so capture just "YYYY-MM-DD" and ignore any time or
        # UTC offset that follows (quoted or unquoted value).
        with open(post, 'r', encoding='utf-8') as f:
            content = f.read()
        front_date = re.search(
            r"^\s*date:\s*['\"]?(\d{4}-\d{2}-\d{2})", content, re.MULTILINE
        )
        week_path = weekly_archive_path(front_date.group(1)) if front_date else None
        if week_path is None:
            week_path = weekly_archive_path(f'{year}-{month}-{day}')
        if week_path is not None:
            mapping.setdefault(week_path, hugo_url)

    return mapping

def update_archive_links(mapping=None, posts_dir=None):
    """Rewrite old ericwagoner.com weblog links in every post to Hugo URLs.

    For each ``*.md`` file in the posts directory:

    1. Date-archive links
       (``.../weblog/archive/YYYY_MM_DD_archive.html`` with an optional
       ``#<digits>`` anchor) are resolved through *mapping*; if the mapping
       has no entry, the first post (in filename order) whose name starts
       with that date is used. The anchor is dropped together with the URL
       so no dangling fragment is left on the new link. Unresolvable links
       are left for step 3.
    2. Every other *mapping* key is replaced literally.
    3. Any remaining ``http://www.ericwagoner.com/weblog`` prefix is
       collapsed to ``/``.

    Files are rewritten in place only when their content changed.

    Args:
        mapping: Old site-relative path -> new URL. When omitted it is built
            by calling ``build_post_mapping()`` with no arguments.
        posts_dir: Directory of posts to rewrite. Defaults to the
            module-level ``POSTS_DIR``. When overriding it, pass a matching
            *mapping* as well.

    Returns:
        Tuple ``(updated_posts, total_replacements)``: number of files
        rewritten and number of archive-link occurrences remapped in steps
        1 and 2 (the generic prefix rewrite of step 3 is not counted).
    """
    site = 'http://www.ericwagoner.com'
    posts_dir = POSTS_DIR if posts_dir is None else Path(posts_dir)
    if mapping is None:
        mapping = build_post_mapping()
    print(f"Built mapping for {len(mapping)} archive URLs")

    # Snapshot the listing before any file is rewritten, sorted so that the
    # "first post of a date" is deterministic.
    posts = sorted(posts_dir.glob('*.md'))

    # "YYYY-MM-DD" filename prefix -> stem of the first post with that
    # prefix; replaces a directory glob per matched link.
    first_post_by_date = {}
    for post in posts:
        first_post_by_date.setdefault(post.name[:10], post.stem)

    date_archive_re = re.compile(
        r'http://www\.ericwagoner\.com'
        r'(/weblog/archive/(\d{4})_(\d{2})_(\d{2})_archive\.html)(?:#\d+)?'
    )

    updated_posts = 0
    total_replacements = 0
    replacements = 0  # per-post counter, reset at the top of each iteration

    def replace_date_archive(match):
        """Return the new URL for one date-archive link, or the link unchanged."""
        nonlocal replacements
        old_path = match.group(1)
        new_url = mapping.get(old_path)
        if new_url is None:
            stem = first_post_by_date.get('-'.join(match.group(2, 3, 4)))
            if stem is None:
                return match.group(0)  # Keep original if no match
            new_url = f'/posts/{stem}/'
        else:
            print(f"  Mapped: {old_path} -> {new_url}")
        replacements += 1
        return new_url

    for post in posts:
        with open(post, 'r', encoding='utf-8') as f:
            original_content = f.read()

        # Every rewrite below targets a string starting with the old site
        # root, so posts that never mention it cannot change.
        if site not in original_content:
            continue

        replacements = 0

        # Step 1: date archives first, so an anchor is consumed together
        # with its URL instead of being left behind by a literal replace.
        content = date_archive_re.sub(replace_date_archive, original_content)

        # Step 2: remaining mapping entries (e.g. post-ID archives).
        for old_url, new_url in mapping.items():
            full_old_url = f'{site}{old_url}'
            occurrences = content.count(full_old_url)
            if occurrences:
                content = content.replace(full_old_url, new_url)
                replacements += occurrences
                print(f"  Mapped: {old_url} -> {new_url}")

        # Step 3: handle generic weblog links.
        content = content.replace(f'{site}/weblog/', '/')
        content = content.replace(f'{site}/weblog', '/')

        if content != original_content:
            with open(post, 'w', encoding='utf-8') as f:
                f.write(content)
            updated_posts += 1
            total_replacements += replacements
            if replacements > 0:
                print(f"Updated {post.name}: {replacements} archive links")

    return updated_posts, total_replacements

def main():
    """Run the remap end to end: preview links, rewrite posts, report leftovers.

    Prints up to ten distinct archive links found in the first 100 posts the
    directory listing yields, calls ``update_archive_links()``, prints its
    two counters, and finally prints how many times the old site root still
    occurs across all posts.
    """
    site_root = 'http://www.ericwagoner.com'
    archive_link_re = re.compile(
        r'http://www\.ericwagoner\.com/weblog/archive/[^)\s"]+'
    )

    print("Remapping WordPress archive links to Hugo posts...")

    # Preview pass: purely informational, nothing is modified here.
    seen_links = set()
    preview_posts = list(POSTS_DIR.glob('*.md'))[:100]
    for path in preview_posts:
        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        seen_links.update(archive_link_re.findall(text))

    if seen_links:
        print("\nSample archive links found:")
        for url in list(seen_links)[:10]:
            print(f"  {url}")

    # Rewrite pass.
    updated, total = update_archive_links()

    print("\n✅ Remapping complete!")
    print(f"  Updated {updated} posts")
    print(f"  Remapped {total} archive links")

    # Leftover pass: count() is 0 when the root is absent, so no guard is
    # needed before adding it to the tally.
    remaining = 0
    for path in POSTS_DIR.glob('*.md'):
        with open(path, 'r', encoding='utf-8') as handle:
            remaining += handle.read().count(site_root)

    print(f"\n📊 Remaining external links: {remaining}")

# Run the remap only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()