Files
kestrelsnest-blog/migrate_all_links.py
Eric Wagoner eddd9d2a80 Import WordPress posts and migrate standalone content to Hugo
- Successfully imported 1731 WordPress posts to Hugo markdown format
- Migrated 204+ images from archive to static directory
- Copied standalone directories (curtain, farm, gobbler, house, images, party, revcemetery, railsday, birthday)
- Fixed all internal links to use /legacy prefix for archived content
- Remapped archive links to point to correct Hugo posts
- Fixed Louisville Georgia Cemetery post rendering issue

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-23 16:23:40 -04:00

230 lines
7.7 KiB
Python

#!/usr/bin/env python3
"""
Find and migrate all remaining ericwagoner.com links - images, pages, and internal blog links
"""
import os
import re
import shutil
from pathlib import Path
from collections import defaultdict
# Hugo posts converted from WordPress — the markdown files scanned and rewritten.
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
# Local mirror of the old ericwagoner.com site — source of archived files.
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/ericwagoner.com')
# Hugo static asset root — destination for migrated images and pages.
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')
def find_all_external_links(posts_dir=None):
    """Scan post markdown files for links pointing at ericwagoner.com.

    Args:
        posts_dir: Directory containing ``*.md`` posts to scan; defaults to
            the module-level ``POSTS_DIR`` when None.

    Returns:
        dict mapping each ``http://www.ericwagoner.com...`` URL to the list
        of post filenames that reference it (each post listed at most once
        per URL).
    """
    posts_dir = POSTS_DIR if posts_dir is None else Path(posts_dir)
    all_links = defaultdict(list)
    # (pattern, group-holding-the-URL).  NOTE: the markdown-link pattern also
    # matches the `[alt](url)` tail of an image link `![alt](url)`, which is
    # why the original separate image pattern double-counted every image URL;
    # one pattern covers both, and per-post dedup below guards against any
    # remaining overlap between markdown and HTML matches.
    patterns = [
        (r'\[([^\]]*)\]\((http://www\.ericwagoner\.com[^)]+)\)', 2),  # Markdown links & images
        (r'href="(http://www\.ericwagoner\.com[^"]+)"', 1),           # HTML links
        (r'src="(http://www\.ericwagoner\.com[^"]+)"', 1),            # Image sources
    ]
    for post in posts_dir.glob('*.md'):
        content = post.read_text(encoding='utf-8')
        found = set()
        for pattern, group in patterns:
            for match in re.finditer(pattern, content):
                found.add(match.group(group))
        # Sorted for a deterministic result regardless of match order.
        for url in sorted(found):
            all_links[url].append(post.name)
    return all_links
def categorize_links(links):
    """Split a url -> referencing-posts mapping into content categories.

    Returns a dict with keys 'images', 'blog_posts', 'html_pages', and
    'other', each holding the subset of *links* that matched that bucket.
    """
    image_re = re.compile(r'\.(jpg|jpeg|gif|png)$', re.IGNORECASE)
    buckets = {'images': {}, 'blog_posts': {}, 'html_pages': {}, 'other': {}}
    for url, posts in links.items():
        if image_re.search(url):
            bucket = 'images'
        elif '/weblog/archives/' in url and url.endswith('.html'):
            # Old WordPress archive permalink — candidate for remapping.
            bucket = 'blog_posts'
        elif url.endswith(('.html', '.htm')):
            bucket = 'html_pages'
        else:
            bucket = 'other'
        buckets[bucket][url] = posts
    return buckets
def migrate_remaining_images(images):
    """Copy archived images referenced by posts into the static tree.

    Looks each URL up under ARCHIVE_DIR (with a /weblog/-stripped fallback)
    and copies it beneath STATIC_DIR/images/legacy, skipping files already
    migrated on a previous run.

    Returns:
        (copied_count, missing_urls) tuple.
    """
    copied = 0
    missing = []
    for url in images:
        # URL path relative to the old site root.
        path = url.replace('http://www.ericwagoner.com', '')
        dest = STATIC_DIR / 'images' / 'legacy' / path.lstrip('/')
        if dest.exists():
            continue  # already migrated
        src = ARCHIVE_DIR / path.lstrip('/')
        if '/weblog/' in path and not src.exists():
            # Some archive files live outside the weblog/ subtree.
            src = ARCHIVE_DIR / path.replace('/weblog/', '/', 1).lstrip('/')
        if not src.exists():
            missing.append(url)
            continue
        dest.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(src, dest)
            copied += 1
            print(f" Copied image: {path}")
        except Exception as e:
            print(f" Error copying {src}: {e}")
    return copied, missing
def find_wordpress_post_id(url):
    """Extract the numeric WordPress post id from an archive URL.

    Handles the historical permalink shapes; returns the id with leading
    zeros removed, or None when the URL carries no recognizable id.
    """
    id_patterns = (
        r'/archives/(\d+)\.html',          # /archives/001234.html
        r'/\?p=(\d+)',                     # /?p=1234
        r'/archives/.*\.php\?id=(\d+)',    # /archives/something.php?id=1234
    )
    for candidate in id_patterns:
        found = re.search(candidate, url)
        if found is not None:
            # Hugo filenames use the un-padded id, so drop leading zeros.
            return found.group(1).lstrip('0')
    return None
def migrate_html_pages(pages):
    """Copy standalone archived HTML pages beneath STATIC_DIR/legacy.

    Mirrors migrate_remaining_images: resolve each URL against ARCHIVE_DIR
    (with a /weblog/-stripped fallback), skip already-copied destinations.

    Returns:
        (copied_count, missing_urls) tuple.
    """
    copied = 0
    missing = []
    for url in pages:
        path = url.replace('http://www.ericwagoner.com', '')
        dest = STATIC_DIR / 'legacy' / path.lstrip('/')
        if dest.exists():
            continue  # already migrated
        src = ARCHIVE_DIR / path.lstrip('/')
        if '/weblog/' in path and not src.exists():
            # Fallback: the file may live outside the weblog/ subtree.
            src = ARCHIVE_DIR / path.replace('/weblog/', '/', 1).lstrip('/')
        if not src.exists():
            missing.append(url)
            continue
        dest.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(src, dest)
            copied += 1
            print(f" Copied HTML page: {path}")
        except Exception as e:
            print(f" Error copying {src}: {e}")
    return copied, missing
def update_links_in_posts(categorized):
    """Rewrite ericwagoner.com links inside every post to local paths.

    Image URLs move under /images/legacy, standalone pages under /legacy,
    and archive blog links are remapped onto the matching Hugo post when
    one can be located by its WordPress id in the filename.

    Returns:
        Number of post files actually modified.
    """
    base = 'http://www.ericwagoner.com'
    changed = 0
    for md_file in POSTS_DIR.glob('*.md'):
        with open(md_file, 'r', encoding='utf-8') as fh:
            text = fh.read()
        before = text
        # Migrated images now live under /images/legacy.
        for url in categorized['images']:
            text = text.replace(url, f"/images/legacy{url.replace(base, '')}")
        # Standalone pages now live under /legacy.
        for url in categorized['html_pages']:
            text = text.replace(url, f"/legacy{url.replace(base, '')}")
        # Archive permalinks -> internal Hugo post URLs, matched by id.
        for url in categorized['blog_posts']:
            wp_id = find_wordpress_post_id(url)
            if not wp_id:
                continue
            candidates = list(POSTS_DIR.glob(f'*-{wp_id}.md')) or \
                list(POSTS_DIR.glob(f'*-{int(wp_id)}.md'))
            if candidates:
                target = f'/posts/{candidates[0].stem}/'
                text = text.replace(url, target)
                print(f" Linked to internal post: {url} -> {target}")
        if text != before:
            with open(md_file, 'w', encoding='utf-8') as fh:
                fh.write(text)
            changed += 1
    return changed
def main():
    """Drive the migration: discover links, categorize, copy, relink.

    Returns the number of unique external URLs found.
    """
    print("Finding all external links to ericwagoner.com...")
    all_links = find_all_external_links()
    print(f"Found {len(all_links)} unique URLs")

    print("\nCategorizing links...")
    categorized = categorize_links(all_links)
    for label, key in (('Images', 'images'), ('Blog posts', 'blog_posts'),
                       ('HTML pages', 'html_pages'), ('Other', 'other')):
        print(f" {label}: {len(categorized[key])}")

    print("\n=== Migrating remaining images ===")
    copied_images, missing_images = migrate_remaining_images(categorized['images'])
    print(f"✅ Migrated {copied_images} images")
    if missing_images:
        print(f"⚠️ Could not find {len(missing_images)} images")

    print("\n=== Migrating HTML pages ===")
    copied_pages, missing_pages = migrate_html_pages(categorized['html_pages'])
    print(f"✅ Migrated {copied_pages} HTML pages")
    if missing_pages:
        print(f"⚠️ Could not find {len(missing_pages)} HTML pages")

    print("\n=== Updating links in posts ===")
    updated = update_links_in_posts(categorized)
    print(f"✅ Updated {updated} posts")

    # Surface a sample of uncategorized links for manual follow-up.
    leftovers = list(categorized['other'])
    if leftovers:
        print("\n=== Other links that may need attention ===")
        for url in leftovers[:10]:
            print(f" {url}")
        if len(leftovers) > 10:
            print(f" ... and {len(leftovers) - 10} more")

    print("\n✅ Migration complete!")
    return len(all_links)
# Run the full migration when executed as a script (no-op on import).
if __name__ == "__main__":
    main()