Files
kestrelsnest-blog/migrate_images.py
Eric Wagoner eddd9d2a80 Import WordPress posts and migrate standalone content to Hugo
- Successfully imported 1731 WordPress posts to Hugo markdown format
- Migrated 204+ images from archive to static directory
- Copied standalone directories (curtain, farm, gobbler, house, images, party, revcemetery, railsday, birthday)
- Fixed all internal links to use /legacy prefix for archived content
- Remapped archive links to point to correct Hugo posts
- Fixed Louisville Georgia Cemetery post rendering issue

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-23 16:23:40 -04:00

134 lines
4.0 KiB
Python

#!/usr/bin/env python3
"""
Find and migrate images from the old WordPress site to Hugo.
"""
import os
import re
import shutil
from pathlib import Path
from urllib.parse import unquote, urlparse
# Directories
# Hugo content directory: every ``*.md`` file directly inside it is scanned
# for image links and rewritten in place.
POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')
# Local copy of the old site: image URL paths are resolved relative to it.
ARCHIVE_DIR = Path('/Users/ericwagoner/Sites/ericwagoner.com')
# Hugo static directory: migrated images are copied under ``images/legacy/``
# inside it.
STATIC_DIR = Path('/Users/ericwagoner/Sites/blog/static')
def extract_image_urls(posts_dir=None):
    """Collect every legacy-site image URL referenced by the Markdown posts.

    Each ``*.md`` file directly inside the posts directory is scanned for
    Markdown image links (``![alt](url)``) and HTML ``src="url"`` attributes
    whose URL starts with ``http://www.ericwagoner.com/`` and ends in a
    ``.jpg``, ``.jpeg``, ``.gif`` or ``.png`` extension.

    Args:
        posts_dir: Directory to scan. When omitted, the module-level
            ``POSTS_DIR`` is used.

    Returns:
        A sorted list of the unique URLs found, spelled exactly as they
        appear in the posts.
    """
    if posts_dir is None:
        posts_dir = POSTS_DIR

    # Compiled once, outside the per-file loop. Matching is case-insensitive
    # so upper-case extensions such as ``.JPG`` are not silently skipped, and
    # a literal dot is required in front of the extension.
    extension = r'\.(?:jpg|jpeg|gif|png)'
    md_pattern = re.compile(
        r'!\[.*?\]\((http://www\.ericwagoner\.com/[^)]+' + extension + r')\)',
        re.IGNORECASE,
    )
    html_pattern = re.compile(
        r'src="(http://www\.ericwagoner\.com/[^"]+' + extension + r')"',
        re.IGNORECASE,
    )

    image_urls = set()
    for post in Path(posts_dir).glob('*.md'):
        with open(post, 'r', encoding='utf-8') as f:
            content = f.read()
        # Markdown image links
        image_urls.update(md_pattern.findall(content))
        # HTML img tags
        image_urls.update(html_pattern.findall(content))
    return sorted(image_urls)
def find_image_in_archive(url, archive_dir=None):
    """Locate the archived file that a legacy image URL refers to.

    Candidate locations are tried in this order and the first one that is an
    existing regular file is returned:

    1. the URL path exactly as written;
    2. the URL path with its first ``/weblog/`` segment removed (the segment
       is sometimes present in the URL but not in the archive);
    3. the same two candidates built from the percent-decoded URL path.

    Args:
        url: Absolute image URL taken from a post.
        archive_dir: Root directory of the archived site. When omitted, the
            module-level ``ARCHIVE_DIR`` is used.

    Returns:
        The ``Path`` of the matching file, or ``None`` if no candidate
        exists.
    """
    if archive_dir is None:
        archive_dir = ARCHIVE_DIR
    archive_dir = Path(archive_dir)

    raw_path = urlparse(url).path
    # urlparse() leaves percent-escapes (e.g. ``%20``) untouched, while files
    # on disk are named with the real characters, so the decoded path is
    # tried as a fallback.
    url_paths = [raw_path]
    decoded_path = unquote(raw_path)
    if decoded_path != raw_path:
        url_paths.append(decoded_path)

    for url_path in url_paths:
        candidates = [url_path]
        if '/weblog/' in url_path:
            candidates.append(url_path.replace('/weblog/', '/', 1))
        for candidate in candidates:
            full_path = archive_dir / candidate.lstrip('/')
            # is_file() rather than exists(): a directory cannot be copied
            # as an image.
            if full_path.is_file():
                return full_path
    return None
def copy_image_to_hugo(src_path, url, static_dir=None):
    """Copy an archived image into the Hugo static tree.

    The file is written to ``<static>/images/legacy/<url path>``, creating
    parent directories as needed, so the old site's directory layout is
    preserved underneath ``images/legacy``.

    Args:
        src_path: Path of the image file to copy.
        url: Original absolute URL of the image; only its path component is
            used.
        static_dir: Hugo static directory. When omitted, the module-level
            ``STATIC_DIR`` is used.

    Returns:
        The site-relative URL of the copy (``/images/legacy/...``), or
        ``None`` if the copy failed (the error is printed).
    """
    if static_dir is None:
        static_dir = STATIC_DIR

    relative_path = urlparse(url).path.lstrip('/')
    # The file on disk is named with decoded characters: HTTP servers
    # percent-decode the request path before mapping it to a file, so a file
    # literally named ``my%20pic.jpg`` would not be found when the returned
    # URL is requested. The returned URL itself keeps its escapes.
    dest_path = Path(static_dir) / 'images' / 'legacy' / unquote(relative_path)

    try:
        # Directory creation is part of the same best-effort operation as
        # the copy, so a failure here is reported instead of aborting the run.
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dest_path)
    except (OSError, ValueError) as e:
        # ValueError covers paths the OS layer rejects outright (for example
        # an embedded NUL after decoding).
        print(f"Error copying {src_path}: {e}")
        return None
    return f'/images/legacy/{relative_path}'
def update_posts_with_new_urls(url_mapping, posts_dir=None):
    """Rewrite old image URLs in every post to their new locations.

    Each ``*.md`` file directly inside the posts directory is read, every
    occurrence of a key of ``url_mapping`` is replaced by its value, and the
    file is written back only if its content changed. The name of each
    rewritten post is printed.

    Args:
        url_mapping: Dict mapping old absolute URLs to their replacement
            URLs.
        posts_dir: Directory containing the posts. When omitted, the
            module-level ``POSTS_DIR`` is used.
    """
    if posts_dir is None:
        posts_dir = POSTS_DIR

    # Longest URLs first, so a URL that is a prefix of another one cannot
    # clobber part of the longer match and the result does not depend on the
    # mapping's insertion order.
    replacements = sorted(
        url_mapping.items(), key=lambda item: len(item[0]), reverse=True
    )

    for post in Path(posts_dir).glob('*.md'):
        # newline='' turns off newline translation so the post's existing
        # line endings survive the read/write round trip.
        with open(post, 'r', encoding='utf-8', newline='') as f:
            original = f.read()

        content = original
        for old_url, new_url in replacements:
            # Literal substring replacement: the new URL is plain text and
            # must not be interpreted as a regex replacement template
            # (backslashes, group references).
            content = content.replace(old_url, new_url)

        if content != original:
            with open(post, 'w', encoding='utf-8', newline='') as f:
                f.write(content)
            print(f"Updated {post.name}")
def main():
    """Run the full image migration, reporting progress on stdout.

    The steps are: collect the image URLs referenced by the posts, copy each
    image that can be located in the archive into the Hugo static tree, then
    rewrite the posts to point at the copies.

    Returns:
        A ``(copied, missing)`` tuple: the number of images copied and the
        number of URLs for which no archive file was found.
    """
    print("Extracting image URLs from posts...")
    urls = extract_image_urls()
    print(f"Found {len(urls)} unique image URLs")

    url_mapping = {}
    missing_urls = []
    copied = 0

    print("\nSearching for images in archive...")
    for url in urls:
        archived_file = find_image_in_archive(url)
        if not archived_file:
            missing_urls.append(url)
            continue

        migrated_url = copy_image_to_hugo(archived_file, url)
        if not migrated_url:
            # The copy failed; the URL is left out of the mapping so the
            # post keeps its original link.
            continue

        url_mapping[url] = migrated_url
        copied += 1
        # Progress line after every tenth successful copy.
        if copied % 10 == 0:
            print(f"Copied {copied} images...")

    print(f"\n✅ Successfully copied {copied} images")

    missing_total = len(missing_urls)
    if missing_total:
        print(f"⚠️ Could not find {missing_total} images:")
        # Only the first five missing URLs are listed; the rest are summarised.
        for url in missing_urls[:5]:
            print(f" - {url}")
        if missing_total > 5:
            print(f" ... and {missing_total - 5} more")

    print("\nUpdating posts with new image URLs...")
    update_posts_with_new_urls(url_mapping)

    print(f"\n✅ Migration complete: {copied} images migrated")
    return copied, missing_total
# Script entry point: the migration runs only when the file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # main() returns (images copied, URLs not found in the archive).
    found, missing = main()