kestrelsnest-blog/wordpress_to_hugo.py

#!/usr/bin/env python3
"""
Convert WordPress posts from a MySQL database into Hugo Markdown files.
"""

import os
import re
import mysql.connector
from datetime import datetime
import html2text
import yaml
from pathlib import Path
import unicodedata

def slugify(value):
    """Convert a string to a valid filename/slug.

    The text is NFKD-normalized, every character that is not a word
    character, whitespace, or a hyphen is removed, the result is lowercased,
    and runs of whitespace/hyphens are collapsed into a single hyphen.

    Args:
        value: The text to convert (for example a post title).

    Returns:
        A lowercase slug of at most 100 characters with no leading or
        trailing hyphens. The result is an empty string when ``value``
        contains no usable characters.
    """
    value = unicodedata.normalize('NFKD', value)
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    # Titles such as "- Hello -" would otherwise produce "-hello-".
    value = re.sub(r'[-\s]+', '-', value).strip('-')
    # Truncation can cut right after a separator, so trim the tail again.
    return value[:100].rstrip('-')

def clean_content(content):
    """Turn WordPress post HTML into Markdown text.

    Empty or missing content yields an empty string. Otherwise the HTML is
    run through html2text and two WordPress leftovers are cleaned from the
    result: ``[caption]`` shortcode wrappers (their inner text is kept) and
    HTML comments.
    """
    if not content:
        return ""

    converter = html2text.HTML2Text()
    converter.links_each_paragraph = False
    converter.images_as_html = False  # emit Markdown image syntax
    converter.unicode_snob = True  # keep unicode characters
    converter.body_width = 0  # no line wrapping

    text = converter.handle(content)

    # (pattern, replacement, flags) applied in order to the converted text.
    cleanup_rules = (
        (r'\[caption[^\]]*\](.*?)\[/caption\]', r'\1', 0),
        (r'<!--.*?-->', '', re.DOTALL),
    )
    for pattern, replacement, flags in cleanup_rules:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()

def get_categories_and_tags(post_id, cursor):
    """Get categories and tags for a post.

    The lookup is best-effort: if the taxonomy tables are missing or a query
    fails, a warning is printed and whatever was collected so far (usually
    two empty lists) is returned instead of raising.

    Args:
        post_id: ID of the post whose terms should be looked up.
        cursor: Database cursor used to run the queries. Any result set left
            on it by this function is fully consumed before returning.

    Returns:
        A ``(categories, tags)`` tuple of lists of term names. The
        "Uncategorized" category (compared case-insensitively) is omitted.
    """
    categories = []
    tags = []

    try:
        # Check if taxonomy tables exist. The result is drained with
        # fetchall() because "_" is a LIKE wildcard, so more than one table
        # can match; rows left unread would make the next execute() on this
        # cursor fail.
        cursor.execute("SHOW TABLES LIKE 'wp_terms'")
        if not cursor.fetchall():
            return categories, tags

        query = """
        SELECT t.name, tt.taxonomy
        FROM wp_terms t
        JOIN wp_term_taxonomy tt ON t.term_id = tt.term_id
        JOIN wp_term_relationships tr ON tt.term_taxonomy_id = tr.term_taxonomy_id
        WHERE tr.object_id = %s
        AND tt.taxonomy IN ('category', 'post_tag')
        """

        cursor.execute(query, (post_id,))
        for name, taxonomy in cursor.fetchall():
            if taxonomy == 'category' and name.lower() != 'uncategorized':
                categories.append(name)
            elif taxonomy == 'post_tag':
                tags.append(name)
    except Exception as exc:
        # Keep the conversion going, but make the failure visible so posts
        # do not silently lose their taxonomy.
        print(f"Warning: could not load categories/tags for post {post_id}: {exc}")

    return categories, tags

def _choose_slug(post_id, title, post_name):
    """Return the slug for a post: post_name, else slugified title, else post-<ID>."""
    if post_name:
        return post_name[:100]
    if title:
        slug = slugify(title)
        # A title made only of punctuation slugifies to "", which would
        # produce a filename like "2020-01-01-.md".
        if slug:
            return slug
    return f"post-{post_id}"


def _build_front_matter(post_id, title, post_date, excerpt, categories, tags):
    """Assemble the Hugo front matter mapping for a single post."""
    front_matter = {
        'title': title or f"Post {post_id}",
        'date': post_date.isoformat(),
        'draft': False,
        'author': 'Eric Wagoner'
    }

    # Strip before testing so a whitespace-only excerpt is left out.
    description = excerpt.strip() if excerpt else ''
    if description:
        front_matter['description'] = description

    if categories:
        front_matter['categories'] = categories

    if tags:
        front_matter['tags'] = tags

    return front_matter


def convert_posts(output_dir=None, db_config=None):
    """Main conversion function.

    Reads every published post from ``wp_posts`` and writes one Markdown file
    with YAML front matter per post into ``output_dir``. Files are named
    ``<YYYY-MM-DD>-<slug>.md``; when two posts converted in the same run map
    to the same name, the later one gets ``-<ID>`` appended. An existing file
    with the same name is overwritten, so re-running the script refreshes the
    output instead of duplicating every post.

    A failure while converting an individual post is recorded and reported,
    and the run continues with the next post.

    Args:
        output_dir: Directory to write the Markdown files to. Defaults to
            ``/Users/ericwagoner/Sites/blog/content/posts``. Created if it
            does not exist.
        db_config: Keyword arguments for ``mysql.connector.connect``.
            Defaults to the local ``wordpress_import`` database as root with
            an empty password.

    Returns:
        A ``(converted_count, error_count)`` tuple.

    Raises:
        Exceptions raised while connecting, creating the output directory, or
        running the main post query are not caught here and propagate to the
        caller.
    """
    if db_config is None:
        db_config = {
            'host': 'localhost',
            'user': 'root',
            'password': '',
            'database': 'wordpress_import',
            'charset': 'utf8mb4',
            'collation': 'utf8mb4_unicode_ci',
        }
    if output_dir is None:
        output_dir = '/Users/ericwagoner/Sites/blog/content/posts'
    output_dir = Path(output_dir)

    # Fetch all published posts
    query = """
    SELECT ID, post_title, post_content, post_date, post_name, post_excerpt
    FROM wp_posts
    WHERE post_status = 'publish'
    AND (post_type = 'post' OR post_type = '')
    ORDER BY post_date ASC
    """

    converted_count = 0
    errors = []

    conn = mysql.connector.connect(**db_config)
    try:
        cursor = conn.cursor()
        try:
            output_dir.mkdir(parents=True, exist_ok=True)

            cursor.execute(query)
            posts = cursor.fetchall()

            print(f"Found {len(posts)} posts to convert")

            # Filenames claimed during this run, lowercased so collisions are
            # also caught on case-insensitive filesystems.
            used_filenames = set()

            for post_id, title, content, post_date, post_name, excerpt in posts:
                try:
                    # Skip if no title and no content
                    if not title and not content:
                        continue

                    slug = _choose_slug(post_id, title, post_name)
                    categories, tags = get_categories_and_tags(post_id, cursor)

                    # Create filename with date
                    date_str = post_date.strftime('%Y-%m-%d')
                    filename = f"{date_str}-{slug}.md"
                    if filename.lower() in used_filenames:
                        # Add post ID to make unique
                        filename = f"{date_str}-{slug}-{post_id}.md"
                    used_filenames.add(filename.lower())
                    filepath = output_dir / filename

                    markdown_content = clean_content(content)
                    front_matter = _build_front_matter(
                        post_id, title, post_date, excerpt, categories, tags
                    )

                    # Write the file
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write('---\n')
                        f.write(yaml.dump(front_matter, default_flow_style=False, allow_unicode=True))
                        f.write('---\n\n')
                        f.write(markdown_content)

                    converted_count += 1
                    if converted_count % 100 == 0:
                        print(f"Converted {converted_count} posts...")

                except Exception as e:
                    # One bad post must not abort the whole migration.
                    error_msg = f"Error converting post {post_id} ('{title}'): {str(e)}"
                    errors.append(error_msg)
                    print(error_msg)
        finally:
            cursor.close()
    finally:
        # Release the connection even when the run fails part-way through.
        conn.close()

    print(f"\n✅ Successfully converted {converted_count} posts")
    if errors:
        print(f"⚠️  {len(errors)} posts had errors:")
        for error in errors[:10]:  # Show first 10 errors
            print(f"  - {error}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more")

    return converted_count, len(errors)

if __name__ == "__main__":
    # Script entry point: run the conversion and report the totals it returns.
    print("Starting WordPress to Hugo conversion...")
    summary = convert_posts()
    converted, errors = summary
    print(f"\nConversion complete: {converted} posts converted, {errors} errors")