# kestrelsnest-blog/quick_link_check.py

#!/usr/bin/env python3
"""
Quick check of external links
"""

import re
from pathlib import Path
from collections import defaultdict

POSTS_DIR = Path('/Users/ericwagoner/Sites/blog/content/posts')

# How many posts to scan; this is a quick sample, not a full crawl.
SAMPLE_SIZE = 50

# How many example URLs to print per category.
EXAMPLES_PER_CATEGORY = 5

# Absolute http://www.ericwagoner.com URLs. A URL is taken to end at the
# first whitespace character, ')' (Markdown link close) or '"' (HTML attr).
LINK_RE = re.compile(r'http://www\.ericwagoner\.com[^\s\)"]+')

# Image URLs are recognised purely by file extension, case-insensitively.
IMAGE_RE = re.compile(r'\.(jpg|jpeg|gif|png)$', re.IGNORECASE)


def find_links(content):
    """Return every ericwagoner.com URL in *content*, in order of appearance.

    Duplicates are kept; callers that want unique links should collect the
    result into a set.
    """
    return LINK_RE.findall(content)


def categorize_links(urls):
    """Split *urls* into ``(images, html_pages, other)`` lists.

    A URL is an image if it ends in .jpg/.jpeg/.gif/.png (any case), an HTML
    page if it ends in .html or .htm (case-sensitive), and "other" otherwise.
    The URLs are processed in sorted order so each returned list is sorted
    and the result does not depend on the iteration order of *urls*.
    """
    images, html_pages, other = [], [], []
    for url in sorted(urls):
        if IMAGE_RE.search(url):
            images.append(url)
        elif url.endswith(('.html', '.htm')):
            html_pages.append(url)
        else:
            other.append(url)
    return images, html_pages, other


# Just check a sample of posts first. glob() yields entries in no guaranteed
# order, so sort before slicing to make the sample reproducible.
sample_posts = sorted(POSTS_DIR.glob('*.md'))[:SAMPLE_SIZE]

all_links = set()

for post in sample_posts:
    all_links.update(find_links(post.read_text(encoding='utf-8')))

# Categorize
images, html_pages, other = categorize_links(all_links)

print(f"Sample from first {SAMPLE_SIZE} posts:")
print(f"  Total unique links: {len(all_links)}")
print(f"  Images: {len(images)}")
print(f"  HTML pages: {len(html_pages)}")
print(f"  Other: {len(other)}")

for heading, urls in (
    ("Sample image links:", images),
    ("Sample HTML page links:", html_pages),
    ("Sample other links:", other),
):
    print(f"\n{heading}")
    for url in urls[:EXAMPLES_PER_CATEGORY]:
        print(f"  {url}")