- Successfully imported 1731 WordPress posts to Hugo markdown format - Migrated 204+ images from archive to static directory - Copied standalone directories (curtain, farm, gobbler, house, images, party, revcemetery, railsday, birthday) - Fixed all internal links to use /legacy prefix for archived content - Remapped archive links to point to correct Hugo posts - Fixed Louisville Georgia Cemetery post rendering issue 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
230 lines
7.7 KiB
Python
230 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Find and migrate all remaining ericwagoner.com links - images, pages, and internal blog links
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import shutil
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
# Hugo content directory holding the migrated markdown posts (scanned/rewritten).
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
# Local mirror of the old ericwagoner.com site — source of legacy images/pages.
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/ericwagoner.com')
# Hugo static directory; legacy assets are copied beneath it.
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')
|
|
|
|
def find_all_external_links():
    """Scan every post for links pointing at ericwagoner.com.

    Returns:
        defaultdict mapping each distinct ericwagoner.com URL to the list of
        post filenames that reference it (each referencing post recorded once
        per URL).
    """
    all_links = defaultdict(list)

    # Each pattern captures the URL in its single group, so no per-pattern
    # group-index sniffing is needed.  The markdown-link pattern also matches
    # the `[alt](url)` tail of image syntax `![alt](url)`, which previously
    # caused every inline image to be recorded twice (once by the link
    # pattern, once by a separate image pattern); a per-post `seen` set now
    # keeps each URL recorded once per post.
    patterns = [
        r'\[(?:[^\]]*)\]\((http://www\.ericwagoner\.com[^)]+)\)',  # Markdown links & images
        r'href="(http://www\.ericwagoner\.com[^"]+)"',             # HTML links
        r'src="(http://www\.ericwagoner\.com[^"]+)"',              # Image sources
    ]

    for post in POSTS_DIR.glob('*.md'):
        with open(post, 'r', encoding='utf-8') as f:
            content = f.read()

        seen = set()  # URLs already recorded for this post
        for pattern in patterns:
            for match in re.finditer(pattern, content):
                url = match.group(1)
                if url not in seen:
                    seen.add(url)
                    all_links[url].append(post.name)

    return all_links
|
|
|
|
def categorize_links(links):
    """Split the URL -> referencing-posts mapping into buckets by link type.

    Args:
        links: mapping of ericwagoner.com URL -> list of post filenames.

    Returns:
        Dict with keys 'images', 'blog_posts', 'html_pages', 'other', each a
        sub-dict of the input keyed by URL.
    """
    images = {}
    blog_posts = {}
    html_pages = {}
    other = {}

    # Compile once; extension match is case-insensitive (.JPG counts).
    image_ext = re.compile(r'\.(jpg|jpeg|gif|png)$', re.IGNORECASE)

    for url, posts in links.items():
        if image_ext.search(url):
            images[url] = posts
        elif '/weblog/archives/' in url and url.endswith('.html'):
            # Old WordPress archive permalinks — candidates for remapping
            # to migrated Hugo posts.
            blog_posts[url] = posts
        elif url.endswith(('.html', '.htm')):
            html_pages[url] = posts
        else:
            other[url] = posts

    return {
        'images': images,
        'blog_posts': blog_posts,
        'html_pages': html_pages,
        'other': other,
    }
|
|
|
|
def migrate_remaining_images(images):
    """Copy not-yet-migrated legacy images into the Hugo static tree.

    Args:
        images: mapping of image URL -> referencing post names.

    Returns:
        Tuple of (number of images copied, list of URLs with no source file
        in the archive).
    """
    migrated = 0
    not_found = []

    for url in images:
        # Strip the host to get the site-relative path.
        rel = url.replace('http://www.ericwagoner.com', '')

        target = STATIC_DIR / 'images' / 'legacy' / rel.lstrip('/')
        if target.exists():
            # Already copied on a previous run.
            continue

        source = ARCHIVE_DIR / rel.lstrip('/')
        if not source.exists() and '/weblog/' in rel:
            # Some archive files live at the site root rather than /weblog/.
            source = ARCHIVE_DIR / rel.replace('/weblog/', '/', 1).lstrip('/')

        if not source.exists():
            not_found.append(url)
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(source, target)
            migrated += 1
            print(f" Copied image: {rel}")
        except Exception as e:
            # Best-effort: report and keep going with the remaining images.
            print(f" Error copying {source}: {e}")

    return migrated, not_found
|
|
|
|
def find_wordpress_post_id(url):
    """Extract the numeric WordPress post ID from a legacy URL.

    Recognized URL shapes:
        /archives/001234.html       -> '1234'
        /?p=1234                    -> '1234'
        /archives/view.php?id=1234  -> '1234'

    Args:
        url: absolute or site-relative legacy URL string.

    Returns:
        The post ID as a string with leading zeros removed, or None when the
        URL contains no recognizable post ID.
    """
    patterns = [
        r'/archives/(\d+)\.html',        # /archives/001234.html
        r'/\?p=(\d+)',                   # /?p=1234
        r'/archives/.*\.php\?id=(\d+)',  # /archives/something.php?id=1234
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            # int() normalizes leading zeros; unlike lstrip('0') it maps an
            # all-zero ID to '0' instead of the empty (falsy) string, which
            # previously made such URLs silently unmatchable.
            return str(int(match.group(1)))

    return None
|
|
|
|
def migrate_html_pages(pages):
    """Copy not-yet-migrated standalone HTML pages under static/legacy/.

    Args:
        pages: mapping of page URL -> referencing post names.

    Returns:
        Tuple of (number of pages copied, list of URLs with no source file
        in the archive).
    """
    migrated = 0
    not_found = []

    for url in pages:
        # Strip the host to get the site-relative path.
        rel = url.replace('http://www.ericwagoner.com', '')

        target = STATIC_DIR / 'legacy' / rel.lstrip('/')
        if target.exists():
            # Already copied on a previous run.
            continue

        source = ARCHIVE_DIR / rel.lstrip('/')
        if not source.exists() and '/weblog/' in rel:
            # Some archive files live at the site root rather than /weblog/.
            source = ARCHIVE_DIR / rel.replace('/weblog/', '/', 1).lstrip('/')

        if not source.exists():
            not_found.append(url)
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(source, target)
            migrated += 1
            print(f" Copied HTML page: {rel}")
        except Exception as e:
            # Best-effort: report and keep going with the remaining pages.
            print(f" Error copying {source}: {e}")

    return migrated, not_found
|
|
|
|
def update_links_in_posts(categorized):
    """Rewrite ericwagoner.com links in every post to their new local paths.

    Images move under /images/legacy/, standalone pages under /legacy/, and
    archive blog links are remapped to the matching Hugo post when one can be
    located via the WordPress post ID embedded in the migrated filename.

    Args:
        categorized: output of categorize_links().

    Returns:
        Number of post files that were modified.
    """
    updates = 0

    for post in POSTS_DIR.glob('*.md'):
        with open(post, 'r', encoding='utf-8') as f:
            content = f.read()

        original_content = content

        # Image links -> /images/legacy/<original site-relative path>
        for url in categorized['images']:
            path = url.replace('http://www.ericwagoner.com', '')
            content = content.replace(url, f'/images/legacy{path}')

        # Standalone HTML pages -> /legacy/<original site-relative path>
        for url in categorized['html_pages']:
            path = url.replace('http://www.ericwagoner.com', '')
            content = content.replace(url, f'/legacy{path}')

        # Archive blog links -> matching migrated Hugo post, if any.
        for url in categorized['blog_posts']:
            post_id = find_wordpress_post_id(url)
            if not post_id:
                continue
            # find_wordpress_post_id already strips leading zeros, so one
            # glob suffices: a second "without leading zeros" pass (formerly
            # f'*-{int(post_id)}.md') produced an identical pattern and was
            # dead code.
            matching_posts = list(POSTS_DIR.glob(f'*-{post_id}.md'))
            if matching_posts:
                new_url = f'/posts/{matching_posts[0].stem}/'
                content = content.replace(url, new_url)
                print(f" Linked to internal post: {url} -> {new_url}")

        # Only rewrite the file when something actually changed.
        if content != original_content:
            with open(post, 'w', encoding='utf-8') as f:
                f.write(content)
            updates += 1

    return updates
|
|
|
|
def main():
    """Run the full link-migration pass and report progress on stdout.

    Returns:
        The number of unique ericwagoner.com URLs found across all posts.
    """
    print("Finding all external links to ericwagoner.com...")
    all_links = find_all_external_links()
    print(f"Found {len(all_links)} unique URLs")

    print("\nCategorizing links...")
    categorized = categorize_links(all_links)

    # Per-bucket summary counts.
    for label, key in (('Images', 'images'), ('Blog posts', 'blog_posts'),
                       ('HTML pages', 'html_pages'), ('Other', 'other')):
        print(f" {label}: {len(categorized[key])}")

    print("\n=== Migrating remaining images ===")
    img_migrated, img_not_found = migrate_remaining_images(categorized['images'])
    print(f"✅ Migrated {img_migrated} images")
    if img_not_found:
        print(f"⚠️ Could not find {len(img_not_found)} images")

    print("\n=== Migrating HTML pages ===")
    html_migrated, html_not_found = migrate_html_pages(categorized['html_pages'])
    print(f"✅ Migrated {html_migrated} HTML pages")
    if html_not_found:
        print(f"⚠️ Could not find {len(html_not_found)} HTML pages")

    print("\n=== Updating links in posts ===")
    updated = update_links_in_posts(categorized)
    print(f"✅ Updated {updated} posts")

    # Surface a sample of uncategorized links for manual follow-up.
    leftovers = list(categorized['other'].keys())
    if leftovers:
        print("\n=== Other links that may need attention ===")
        for url in leftovers[:10]:
            print(f" {url}")
        if len(leftovers) > 10:
            print(f" ... and {len(leftovers) - 10} more")

    print("\n✅ Migration complete!")

    return len(all_links)
|
|
|
|
# Run the migration only when executed as a script, not on import.
if __name__ == "__main__":
    main()