#!/usr/bin/env python3
"""
Remap old WordPress archive links to new Hugo post URLs
"""
import re
from pathlib import Path
from datetime import datetime

POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')

# Front-matter timestamp, e.g.  date: '2003-11-18T12:00:00-05:00'
# Compiled once: build_post_mapping() applies it to every post file.
FRONT_MATTER_DATE_RE = re.compile(r"date:\s*['\"]([^'\"]+)['\"]")


def parse_post_date(content):
    """Parse the post's publication datetime from its front matter.

    Args:
        content: full text of a markdown post file.

    Returns:
        A naive ``datetime`` for the ``date:`` front-matter field, or
        ``None`` when the field is missing or unparseable.

    Note:
        Only the first 19 characters (``YYYY-MM-DD[THH:MM:SS]``) are
        kept so a trailing UTC offset such as ``-05:00`` cannot confuse
        ``fromisoformat`` — the previous ``.split('-')[0]`` approach
        truncated the string at the first hyphen (leaving just the
        year), which made parsing fail for every post.
    """
    match = FRONT_MATTER_DATE_RE.search(content)
    if not match:
        return None
    trimmed = match.group(1)[:19].replace('T', ' ')
    try:
        return datetime.fromisoformat(trimmed)
    except ValueError:
        return None


def build_post_mapping():
    """Build mapping from old WordPress IDs/dates to new Hugo URLs.

    Returns:
        dict mapping old site-relative archive paths (e.g.
        ``/weblog/archives/000123.html``) to Hugo URLs
        (``/posts/<slug>/``).
    """
    mapping = {}

    # Map by WordPress post ID taken from the end of the filename
    # (e.g. "2003-11-18-first-image-1234.md" -> ID 1234).
    for post in POSTS_DIR.glob('*.md'):
        match = re.search(r'-(\d+)\.md$', post.name)
        if match:
            post_id = match.group(1)
            hugo_url = f'/posts/{post.stem}/'
            # WordPress served both zero-padded and plain ID archives:
            # /archives/000123.html and /archives/123.html
            mapping[f'/weblog/archives/{post_id.zfill(6)}.html'] = hugo_url
            mapping[f'/weblog/archives/{post_id}.html'] = hugo_url

    # Map by date-based archive pages.
    for post in POSTS_DIR.glob('*.md'):
        # Filenames start with the date: "2003-11-18-...".
        match = re.match(r'(\d{4})-(\d{2})-(\d{2})', post.name)
        if not match:
            continue
        year, month, day = match.groups()
        hugo_url = f'/posts/{post.stem}/'

        # Read the post to get the exact front-matter timestamp.
        with open(post, 'r', encoding='utf-8') as f:
            content = f.read()
        post_date = parse_post_date(content)
        if post_date is None:
            continue

        # Daily archives: /archive/YYYY_MM_DD_archive.html
        archive_date = f"{year}_{month}_{day}"
        mapping[f'/weblog/archive/{archive_date}_archive.html'] = hugo_url

        # NOTE(review): the original comment claimed this was the
        # "Sunday of that week", but the code only reformats the post's
        # own date — preserved as-is to avoid changing which URLs map.
        week_start = post_date.strftime('%Y_%m_%d')
        mapping[f'/weblog/archive/{week_start}_archive.html'] = hugo_url

    return mapping


def update_archive_links():
    """Update archive links in all posts.

    Returns:
        Tuple ``(updated_posts, total_replacements)``.
    """
    mapping = build_post_mapping()
    print(f"Built mapping for {len(mapping)} archive URLs")

    updated_posts = 0
    total_replacements = 0

    for post in POSTS_DIR.glob('*.md'):
        with open(post, 'r', encoding='utf-8') as f:
            content = f.read()

        original_content = content
        replacements = 0

        # Exact-match replacement for every ericwagoner.com link we
        # could map directly.
        for old_url, new_url in mapping.items():
            full_old_url = f'http://www.ericwagoner.com{old_url}'
            if full_old_url in content:
                content = content.replace(full_old_url, new_url)
                replacements += 1
                print(f" Mapped: {old_url} -> {new_url}")

        # Archive links may carry an anchor, e.g.
        # /archive/1999_10_31_archive.html#10460 — resolve them to the
        # first post published on that date.
        archive_pattern = r'http://www\.ericwagoner\.com/weblog/archive/(\d{4}_\d{2}_\d{2})_archive\.html(?:#\d+)?'

        def replace_archive_link(match):
            date_str = match.group(1)
            year, month, day = date_str.split('_')
            date_prefix = f"{year}-{month}-{day}"
            # Find posts from that date; use the first one.
            matching_posts = list(POSTS_DIR.glob(f"{date_prefix}*.md"))
            if matching_posts:
                return f'/posts/{matching_posts[0].stem}/'
            return match.group(0)  # keep original if no match

        content = re.sub(archive_pattern, replace_archive_link, content)

        if content != original_content:
            # Recount via the growth in '/posts/' occurrences so a
            # str.replace that hit multiple occurrences is counted per
            # link rather than once per mapping entry.
            replacements = content.count('/posts/') - original_content.count('/posts/')

        # Any leftover weblog links become site-relative.
        content = content.replace('http://www.ericwagoner.com/weblog/', '/')
        content = content.replace('http://www.ericwagoner.com/weblog', '/')

        if content != original_content:
            with open(post, 'w', encoding='utf-8') as f:
                f.write(content)
            updated_posts += 1
            total_replacements += replacements
            if replacements > 0:
                print(f"Updated {post.name}: {replacements} archive links")

    return updated_posts, total_replacements


def main():
    """Run the remapping pass and report before/after statistics."""
    print("Remapping WordPress archive links to Hugo posts...")

    # First, show a sample of archive links found in the first 100 posts.
    sample_links = set()
    for post in list(POSTS_DIR.glob('*.md'))[:100]:
        with open(post, 'r', encoding='utf-8') as f:
            content = f.read()
        links = re.findall(r'http://www\.ericwagoner\.com/weblog/archive/[^)\s"]+', content)
        sample_links.update(links)

    if sample_links:
        print("\nSample archive links found:")
        for link in list(sample_links)[:10]:
            print(f" {link}")

    # Update the links.
    updated, total = update_archive_links()

    print(f"\nāœ… Remapping complete!")
    print(f" Updated {updated} posts")
    print(f" Remapped {total} archive links")

    # Check what's left: absolute ericwagoner.com links we didn't map.
    remaining = 0
    for post in POSTS_DIR.glob('*.md'):
        with open(post, 'r', encoding='utf-8') as f:
            content = f.read()
        if 'http://www.ericwagoner.com' in content:
            remaining += content.count('http://www.ericwagoner.com')

    print(f"\nšŸ“Š Remaining external links: {remaining}")


if __name__ == "__main__":
    main()