Import WordPress posts and migrate standalone content to Hugo
- Successfully imported 1731 WordPress posts to Hugo markdown format
- Migrated 204+ images from archive to static directory
- Copied standalone directories (curtain, farm, gobbler, house, images, party, revcemetery, railsday, birthday)
- Fixed all internal links to use /legacy prefix for archived content
- Remapped archive links to point to correct Hugo posts
- Fixed Louisville Georgia Cemetery post rendering issue

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
134
migrate_images.py
Normal file
134
migrate_images.py
Normal file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Find and migrate images from old WordPress site to Hugo
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Filesystem locations used by the migration, all rooted in one Sites folder.
_SITES_ROOT = Path('/Users/ericwagoner/Sites')

# Markdown posts that are scanned and rewritten.
POSTS_DIR = _SITES_ROOT / 'blog' / 'content' / 'posts'
# Copy of the old site that image files are looked up in.
ARCHIVE_DIR = _SITES_ROOT / 'ericwagoner.com'
# Hugo static directory that images are copied into.
STATIC_DIR = _SITES_ROOT / 'blog' / 'static'
|
||||
|
||||
def extract_image_urls(posts_dir=None):
    """Collect every legacy-site image URL referenced by the Markdown posts.

    Each ``*.md`` file directly inside the posts directory is scanned for
    Markdown image links (``![alt](url)``) and HTML ``src="url"`` attributes
    whose URL starts with ``http://www.ericwagoner.com/`` and ends in
    jpg, jpeg, gif or png.

    Args:
        posts_dir: Directory to scan. When omitted, the module-level
            ``POSTS_DIR`` is used.

    Returns:
        A sorted list of the unique URLs, exactly as written in the posts.
    """
    if posts_dir is None:
        posts_dir = POSTS_DIR

    # Matching is case-insensitive so that upper-case extensions (photo.JPG)
    # and old-style upper-case markup (SRC="...") are not silently skipped.
    # The captured URL keeps its original casing.
    md_pattern = re.compile(
        r'!\[.*?\]\((http://www\.ericwagoner\.com/[^)]+(?:jpg|jpeg|gif|png))\)',
        re.IGNORECASE,
    )
    html_pattern = re.compile(
        r'src="(http://www\.ericwagoner\.com/[^"]+(?:jpg|jpeg|gif|png))"',
        re.IGNORECASE,
    )

    image_urls = set()
    for post in Path(posts_dir).glob('*.md'):
        content = post.read_text(encoding='utf-8')
        image_urls.update(md_pattern.findall(content))
        image_urls.update(html_pattern.findall(content))

    return sorted(image_urls)
|
||||
|
||||
def find_image_in_archive(url, archive_dir=None):
    """Locate the archived file that corresponds to an image URL.

    The URL's path is resolved relative to the archive directory. Candidates
    are tried in this order, and the first one that is a regular file wins:

    1. the path exactly as it appears in the URL;
    2. the percent-decoded path (``my%20pic.jpg`` -> ``my pic.jpg``);
    3. either of the above with the first ``/weblog/`` segment removed,
       because the URL sometimes contains ``weblog`` while the archive
       does not.

    Args:
        url: Absolute image URL taken from a post.
        archive_dir: Root of the archive. When omitted, the module-level
            ``ARCHIVE_DIR`` is used.

    Returns:
        The ``Path`` of the matching file, or ``None`` if nothing matches.
    """
    # Function-scope import: the file's top-level import only brings in urlparse.
    from urllib.parse import unquote

    if archive_dir is None:
        archive_dir = ARCHIVE_DIR
    archive_root = Path(archive_dir)

    url_path = urlparse(url).path

    candidates = [url_path]
    decoded = unquote(url_path)
    if decoded != url_path:
        candidates.append(decoded)

    # Append the "/weblog/"-less variants after the direct ones so a direct
    # hit always takes priority.
    candidates.extend(
        candidate.replace('/weblog/', '/', 1)
        for candidate in list(candidates)
        if '/weblog/' in candidate
    )

    for candidate in candidates:
        full_path = archive_root / candidate.lstrip('/')
        # is_file() rather than exists(): a directory is not a usable image.
        if full_path.is_file():
            return full_path

    return None
|
||||
|
||||
def copy_image_to_hugo(src_path, url, static_dir=None):
    """Copy an archived image into Hugo's static tree and return its new URL.

    The file is written to ``<static_dir>/images/legacy/<url path>``,
    creating intermediate directories as needed. The on-disk name is
    percent-decoded, so a URL ending in ``my%20pic.jpg`` is stored as
    ``my pic.jpg``; the returned URL path keeps the encoding used in the
    original URL.

    Args:
        src_path: Path of the existing image file to copy.
        url: Original absolute URL of the image; only its path is used.
        static_dir: Hugo static directory. When omitted, the module-level
            ``STATIC_DIR`` is used.

    Returns:
        The site-relative URL (``/images/legacy/...``) on success, or
        ``None`` if the directory could not be created or the copy failed
        (the error is printed).
    """
    # Function-scope import: the file's top-level import only brings in urlparse.
    from urllib.parse import unquote

    if static_dir is None:
        static_dir = STATIC_DIR

    relative_path = urlparse(url).path.lstrip('/')
    dest_path = Path(static_dir) / 'images' / 'legacy' / unquote(relative_path)

    try:
        # Directory creation is inside the try so a permissions or disk error
        # is reported like a copy failure rather than aborting the whole run.
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dest_path)
    except OSError as e:
        # Only filesystem errors are expected here; anything else is a bug
        # and should surface.
        print(f"Error copying {src_path}: {e}")
        return None

    return f'/images/legacy/{relative_path}'
|
||||
|
||||
def update_posts_with_new_urls(url_mapping, posts_dir=None):
    """Rewrite image URLs in every Markdown post according to a mapping.

    Each ``*.md`` file directly inside the posts directory is read, every
    occurrence of each old URL is replaced by its new URL, and the file is
    written back only if its content changed. The name of each rewritten
    file is printed.

    Args:
        url_mapping: Dict mapping old URL -> new URL. Empty keys are ignored.
        posts_dir: Directory containing the posts. When omitted, the
            module-level ``POSTS_DIR`` is used.
    """
    if posts_dir is None:
        posts_dir = POSTS_DIR

    for post in Path(posts_dir).glob('*.md'):
        original = post.read_text(encoding='utf-8')

        content = original
        for old_url, new_url in url_mapping.items():
            if not old_url:
                # Replacing "" would splice new_url between every character.
                continue
            # Literal replacement: the new URL must not be interpreted as a
            # regex replacement template (backslashes, group references).
            content = content.replace(old_url, new_url)

        if content != original:
            post.write_text(content, encoding='utf-8')
            print(f"Updated {post.name}")
|
||||
|
||||
def main():
    """Run the full image migration and print a progress report.

    Steps: collect image URLs from the posts, look each one up in the
    archive, copy the files that are found into the Hugo static tree, then
    rewrite the posts to use the new URLs.

    Returns:
        A ``(found_count, missing_count)`` tuple: the number of images that
        were copied, and the number of URLs with no file in the archive.
        Images that were found but could not be copied are reported in the
        output but are not included in either number.
    """
    print("Extracting image URLs from posts...")
    image_urls = extract_image_urls()
    print(f"Found {len(image_urls)} unique image URLs")

    url_mapping = {}
    not_found = []
    copy_failed = []

    print("\nSearching for images in archive...")
    for url in image_urls:
        src_path = find_image_in_archive(url)
        if src_path is None:
            not_found.append(url)
            continue

        new_url = copy_image_to_hugo(src_path, url)
        if new_url is None:
            # Previously these vanished from the final summary entirely.
            copy_failed.append(url)
            continue

        url_mapping[url] = new_url
        if len(url_mapping) % 10 == 0:
            print(f"Copied {len(url_mapping)} images...")

    # URLs are unique, so the mapping size equals the number of copies made.
    found_count = len(url_mapping)
    print(f"\n✅ Successfully copied {found_count} images")

    if not_found:
        print(f"⚠️ Could not find {len(not_found)} images:")
        for url in not_found[:5]:
            print(f" - {url}")
        if len(not_found) > 5:
            print(f" ... and {len(not_found) - 5} more")

    if copy_failed:
        print(f"⚠️ Failed to copy {len(copy_failed)} images that were found in the archive")

    print("\nUpdating posts with new image URLs...")
    update_posts_with_new_urls(url_mapping)

    print(f"\n✅ Migration complete: {found_count} images migrated")

    return found_count, len(not_found)
|
||||
|
||||
if __name__ == "__main__":
    # main() prints its own summary; its (found, missing) return value is
    # not needed when run as a script.
    main()
|
Reference in New Issue
Block a user