# kestrelsnest-blog/fix_internal_archive_links.py

#!/usr/bin/env python3
"""
Fix internal archive links that point to /legacy/weblog/archive/ instead of Hugo posts
"""

import re
from pathlib import Path
from datetime import datetime

POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')

def find_posts_by_date(year, month, day=None):
    """Return the post files in POSTS_DIR whose name starts with a date.

    Files are matched by filename prefix: ``YEAR-MM-DD*.md`` when ``day`` is
    given, otherwise ``YEAR-MM-*.md`` (every post of that month).

    Args:
        year: Year as an int; inserted into the pattern as-is.
        month: Month as an int; zero-padded to two digits.
        day: Optional day as an int; zero-padded to two digits. A falsy
            value (``None`` or ``0``) selects the whole month.

    Returns:
        A list of ``Path`` objects sorted by path; empty when nothing matches.
    """
    if day:
        pattern = f"{year}-{month:02d}-{day:02d}*.md"
    else:
        pattern = f"{year}-{month:02d}-*.md"

    # glob() yields entries in whatever order the filesystem returns them.
    # Sorting makes the result (and any "take the first hit" caller)
    # reproducible across runs and machines.
    return sorted(POSTS_DIR.glob(pattern))

def fix_internal_archive_links():
    """Rewrite legacy day-archive links in every post to point at Hugo posts.

    Each ``*.md`` file directly inside POSTS_DIR is scanned for links of the
    form::

        /legacy/weblog/archive/YYYY_MM_DD_archive.html
        /legacy/weblog/archives/YYYY_MM_DD_archive.html
        /weblog/archive/YYYY_MM_DD_archive.html

    each optionally followed by a numeric ``#anchor``. When
    find_posts_by_date() returns at least one post for that date, the whole
    link (anchor included) is replaced with ``/posts/<post stem>/``;
    otherwise the link is left untouched and a warning is printed. A file is
    rewritten only when at least one of its links was replaced.

    Returns:
        A ``(fixed_count, total_fixes)`` tuple: the number of files rewritten
        and the number of link occurrences replaced.
    """
    # One combined pattern, applied in a single left-to-right pass, so that
    # every link is seen exactly once. The "/legacy/..." alternative starts
    # earlier in the text than the bare "/weblog/archive/" one, so a legacy
    # link is always consumed whole rather than re-matched as a substring.
    archive_link = re.compile(
        r'(?:/legacy/weblog/archives?|/weblog/archive)'
        r'/(\d{4})_(\d{2})_(\d{2})_archive\.html(?:#\d+)?'
    )

    fixed_count = 0
    total_fixes = 0

    # (year, month, day) strings -> replacement URL, or None when no post
    # exists for that date. Avoids re-globbing the posts directory for every
    # occurrence of the same archive date.
    targets = {}

    for post in POSTS_DIR.glob('*.md'):
        with open(post, 'r', encoding='utf-8') as f:
            original_content = f.read()

        pieces = []   # alternating unchanged text / replacement URLs
        cursor = 0    # end of the last replaced span in original_content
        fixes_in_post = 0

        for match in archive_link.finditer(original_content):
            old_url = match.group(0)
            date_key = match.groups()
            year, month, day = date_key

            if date_key not in targets:
                matching_posts = find_posts_by_date(int(year), int(month), int(day))
                # min() rather than [0]: several posts may share a date, and
                # picking the lowest path keeps the choice deterministic.
                targets[date_key] = (
                    f'/posts/{min(matching_posts).stem}/' if matching_posts else None
                )

            new_url = targets[date_key]
            if new_url is None:
                print(f"  Warning: No post found for date {year}-{month}-{day} (from {old_url} in {post.name})")
                continue

            # Splice by match span instead of str.replace(): replace() also
            # rewrites the prefix of a longer "...html#123" link and leaves
            # a dangling anchor behind.
            pieces.append(original_content[cursor:match.start()])
            pieces.append(new_url)
            cursor = match.end()
            print(f"  Fixed: {old_url} -> {new_url} in {post.name}")
            fixes_in_post += 1

        if fixes_in_post:
            pieces.append(original_content[cursor:])
            with open(post, 'w', encoding='utf-8') as f:
                f.write(''.join(pieces))
            fixed_count += 1
            total_fixes += fixes_in_post

    return fixed_count, total_fixes

def check_remaining_legacy_links():
    """Collect every distinct /legacy/ link still present in the posts.

    Reads each ``*.md`` file directly inside POSTS_DIR and gathers the
    substrings that start with ``/legacy/`` and run up to the next
    whitespace character, ``)`` or ``"``.

    Returns:
        A set of the unique link strings found across all posts.
    """
    legacy_link = re.compile(r'/legacy/[^)\s"]+')
    found = set()

    for post_path in POSTS_DIR.glob('*.md'):
        text = post_path.read_text(encoding='utf-8')
        found.update(legacy_link.findall(text))

    return found

def main():
    """Run the archive-link fix, then report the /legacy/ links that remain.

    Prints how many links were fixed in how many posts. If any /legacy/
    links are still present, prints their total and up to five examples
    each of the archive, image and other links.
    """
    print("Fixing internal archive links...")

    # Rewrite the links and report the totals.
    posts_fixed, links_fixed = fix_internal_archive_links()
    print(f"\n✅ Fixed {links_fixed} archive links in {posts_fixed} posts")

    # Nothing more to report when no /legacy/ links are left.
    legacy_links = check_remaining_legacy_links()
    if not legacy_links:
        return

    print(f"\n📊 Remaining /legacy/ links to verify ({len(legacy_links)} unique):")

    # Sort each link into exactly one bucket; the archive test wins over
    # the image-extension test.
    image_extensions = ['.jpg', '.jpeg', '.gif', '.png']
    archives, images, other = [], [], []
    for link in sorted(legacy_links):
        if '/archive' in link:
            bucket = archives
        elif any(ext in link for ext in image_extensions):
            bucket = images
        else:
            bucket = other
        bucket.append(link)

    sections = [
        (f"\n  Archive links ({len(archives)}):", archives),
        (f"\n  Image links ({len(images)}):", images),
        (f"\n  Other links ({len(other)}):", other),
    ]

    # Show at most five examples per non-empty bucket, then the overflow.
    for header, links in sections:
        if not links:
            continue
        print(header)
        for link in links[:5]:
            print(f"    {link}")
        if len(links) > 5:
            print(f"    ... and {len(links) - 5} more")

if __name__ == "__main__":
    main()