Import WordPress posts and migrate standalone content to Hugo
- Successfully imported 1,731 WordPress posts to Hugo markdown format
- Migrated 204+ images from the archive to the static directory
- Copied standalone directories (curtain, farm, gobbler, house, images, party, revcemetery, railsday, birthday)
- Fixed all internal links to use the /legacy prefix for archived content
- Remapped archive links to point to the correct Hugo posts
- Fixed the Louisville, Georgia cemetery post rendering issue

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
230
migrate_all_links.py
Normal file
230
migrate_all_links.py
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Find and migrate all remaining ericwagoner.com links - images, pages, and internal blog links
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# Hugo content directory holding the converted markdown posts (scan + rewrite target).
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
# Local snapshot of the old ericwagoner.com site — the migration source.
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/blog/static') if False else Path('/Users/ericwagoner/Sites/ericwagoner.com')
# Hugo static directory — destination for migrated images and legacy pages.
# NOTE(review): hard-coded absolute paths; this one-off script only runs on the author's machine.
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')
|
||||
|
||||
def find_all_external_links():
    """Scan every markdown post for URLs pointing at www.ericwagoner.com.

    Returns a defaultdict mapping each matched URL to the list of post
    filenames that reference it (a post appears once per match).
    """
    link_patterns = (
        r'\[([^\]]*)\]\((http://www\.ericwagoner\.com[^)]+)\)',   # Markdown links
        r'href="(http://www\.ericwagoner\.com[^"]+)"',            # HTML links
        r'src="(http://www\.ericwagoner\.com[^"]+)"',             # Image sources
        r'!\[([^\]]*)\]\((http://www\.ericwagoner\.com[^)]+)\)',  # Image links
    )
    found = defaultdict(list)

    for post_file in POSTS_DIR.glob('*.md'):
        text = post_file.read_text(encoding='utf-8')
        for pattern in link_patterns:
            # href/src patterns capture the URL as group 1; the markdown
            # patterns capture the link text first, so their URL is group 2.
            url_group = 1 if ('href' in pattern or 'src' in pattern) else 2
            for hit in re.finditer(pattern, text):
                found[hit.group(url_group)].append(post_file.name)

    return found
|
||||
|
||||
def categorize_links(links):
    """Split a URL -> posts mapping into images, blog posts, HTML pages, and other.

    Classification order matters: an archive URL ending in an image extension
    is treated as an image, not a blog post.
    """
    images, blog_posts, html_pages, other = {}, {}, {}, {}

    for url, referring_posts in links.items():
        if re.search(r'\.(jpg|jpeg|gif|png)$', url, re.IGNORECASE):
            images[url] = referring_posts
        elif '/weblog/archives/' in url and url.endswith('.html'):
            blog_posts[url] = referring_posts
        elif url.endswith(('.html', '.htm')):
            html_pages[url] = referring_posts
        else:
            other[url] = referring_posts

    return {
        'images': images,
        'blog_posts': blog_posts,
        'html_pages': html_pages,
        'other': other,
    }
|
||||
|
||||
def migrate_remaining_images(images):
    """Copy not-yet-migrated images from the archive into static/images/legacy.

    Returns a tuple (number of images copied, list of source URLs that
    could not be located in the archive).
    """
    copied = 0
    missing = []

    for url in images:
        # Strip the host to get the site-relative path.
        path = url.replace('http://www.ericwagoner.com', '')

        # Skip anything a previous run already migrated.
        dest_path = STATIC_DIR / 'images' / 'legacy' / path.lstrip('/')
        if dest_path.exists():
            continue

        # Locate the file in the archive snapshot; some files live without
        # the /weblog/ prefix, so fall back to that layout.
        src_path = ARCHIVE_DIR / path.lstrip('/')
        if not src_path.exists() and '/weblog/' in path:
            src_path = ARCHIVE_DIR / path.replace('/weblog/', '/', 1).lstrip('/')

        if not src_path.exists():
            missing.append(url)
            continue

        dest_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(src_path, dest_path)
        except Exception as e:
            print(f"  Error copying {src_path}: {e}")
        else:
            copied += 1
            print(f"  Copied image: {path}")

    return copied, missing
|
||||
|
||||
def find_wordpress_post_id(url):
    """Extract the numeric WordPress post ID from an archive-style URL.

    Returns the ID with leading zeros stripped (an all-zero ID yields '',
    which callers treat as "no ID"), or None when no pattern matches.
    """
    id_patterns = (
        r'/archives/(\d+)\.html',        # /archives/001234.html
        r'/\?p=(\d+)',                   # /?p=1234
        r'/archives/.*\.php\?id=(\d+)',  # /archives/something.php?id=1234
    )

    for pat in id_patterns:
        hit = re.search(pat, url)
        if hit:
            # Hugo post filenames use the ID without leading zeros.
            return hit.group(1).lstrip('0')

    return None
|
||||
|
||||
def migrate_html_pages(pages):
    """Copy standalone HTML pages from the archive into static/legacy.

    Returns a tuple (number of pages copied, list of source URLs that
    could not be located in the archive).
    """
    copied = 0
    missing = []

    for url in pages:
        # Strip the host to get the site-relative path.
        path = url.replace('http://www.ericwagoner.com', '')

        # Skip anything a previous run already migrated.
        dest_path = STATIC_DIR / 'legacy' / path.lstrip('/')
        if dest_path.exists():
            continue

        # Locate the file in the archive snapshot; some files live without
        # the /weblog/ prefix, so fall back to that layout.
        src_path = ARCHIVE_DIR / path.lstrip('/')
        if not src_path.exists() and '/weblog/' in path:
            src_path = ARCHIVE_DIR / path.replace('/weblog/', '/', 1).lstrip('/')

        if not src_path.exists():
            missing.append(url)
            continue

        dest_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(src_path, dest_path)
        except Exception as e:
            print(f"  Error copying {src_path}: {e}")
        else:
            copied += 1
            print(f"  Copied HTML page: {path}")

    return copied, missing
|
||||
|
||||
def update_links_in_posts(categorized):
    """Rewrite ericwagoner.com URLs in every post to their migrated locations.

    Images move under /images/legacy, standalone pages under /legacy, and
    old WordPress archive URLs are remapped to the imported Hugo post slugs.
    Returns the number of posts whose content changed.
    """
    changed = 0

    for post_file in POSTS_DIR.glob('*.md'):
        with open(post_file, 'r', encoding='utf-8') as fh:
            text = fh.read()
        rewritten = text

        # Point image URLs at their copies under /images/legacy.
        for url in categorized['images']:
            rel = url.replace('http://www.ericwagoner.com', '')
            rewritten = rewritten.replace(url, '/images/legacy' + rel)

        # Point standalone HTML pages at their copies under /legacy.
        for url in categorized['html_pages']:
            rel = url.replace('http://www.ericwagoner.com', '')
            rewritten = rewritten.replace(url, '/legacy' + rel)

        # Remap old WordPress archive URLs onto the imported Hugo posts,
        # matched by the post ID embedded in the filename.
        for url in categorized['blog_posts']:
            post_id = find_wordpress_post_id(url)
            if not post_id:
                continue
            matches = list(POSTS_DIR.glob(f'*-{post_id}.md'))
            if not matches:
                # Retry without leading zeros (historical filename variants).
                matches = list(POSTS_DIR.glob(f'*-{int(post_id)}.md'))
            if matches:
                new_url = f'/posts/{matches[0].stem}/'
                rewritten = rewritten.replace(url, new_url)
                print(f"  Linked to internal post: {url} -> {new_url}")

        # Only touch the file when something actually changed.
        if rewritten != text:
            with open(post_file, 'w', encoding='utf-8') as fh:
                fh.write(rewritten)
            changed += 1

    return changed
|
||||
|
||||
def main():
    """Run the full migration pipeline: discover, categorize, copy, relink.

    Returns the number of unique ericwagoner.com URLs found.
    """
    print("Finding all external links to ericwagoner.com...")
    links = find_all_external_links()
    print(f"Found {len(links)} unique URLs")

    print("\nCategorizing links...")
    cats = categorize_links(links)

    print(f"  Images: {len(cats['images'])}")
    print(f"  Blog posts: {len(cats['blog_posts'])}")
    print(f"  HTML pages: {len(cats['html_pages'])}")
    print(f"  Other: {len(cats['other'])}")

    print("\n=== Migrating remaining images ===")
    n_images, images_missing = migrate_remaining_images(cats['images'])
    print(f"✅ Migrated {n_images} images")
    if images_missing:
        print(f"⚠️  Could not find {len(images_missing)} images")

    print("\n=== Migrating HTML pages ===")
    n_pages, pages_missing = migrate_html_pages(cats['html_pages'])
    print(f"✅ Migrated {n_pages} HTML pages")
    if pages_missing:
        print(f"⚠️  Could not find {len(pages_missing)} HTML pages")

    print("\n=== Updating links in posts ===")
    n_updated = update_links_in_posts(cats)
    print(f"✅ Updated {n_updated} posts")

    # Surface a sample of the uncategorized links for manual review.
    leftovers = cats['other']
    if leftovers:
        print("\n=== Other links that may need attention ===")
        for url in list(leftovers.keys())[:10]:
            print(f"  {url}")
        if len(leftovers) > 10:
            print(f"  ... and {len(leftovers) - 10} more")

    print("\n✅ Migration complete!")

    return len(links)
|
||||
|
||||
# Script entry point; main()'s return value (the URL count) is unused here.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user