#!/usr/bin/env python3
"""Mirror the galleries listed on kontakt-bamberg.de as Markdown pages.

The script fetches the gallery overview at ``GALLERIES_URL``, follows every
gallery link found there and writes ``TARGET_DIR/<slug>/index.md`` for each
one.  Every page consists of a YAML front matter (slug, title, date) followed
by the gallery's description converted to Markdown.
"""
from pathlib import Path
from re import search, sub
from sys import stderr

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from yaml import CDumper, dump
import requests

BASE_URL = 'https://kontakt-bamberg.de'
GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'

# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 30

# A German long-form date such as "Samstag, 5. März 2022".  The dot after the
# day number is escaped so that it only matches a literal ".".
DATE_PATTERN = r'[DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4}'
# A single date or a range "<start> bis <end>".  Group 1 is the start date and
# group 2 the end date (None for a single date).  The range part is a greedy
# optional group: an alternation with a leading empty branch would always
# take the empty branch and never capture the end date.
DATE_RANGE_PATTERN = f'({DATE_PATTERN})(?: bis ({DATE_PATTERN}))?'

MONTH_NAMES = (
    'Januar', 'Februar', 'März', 'April', 'Mai', 'Juni',
    'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember',
)
# Maps a German month name to its zero-padded number ("Januar" -> "01").
MONTHS = {month: str(i + 1).zfill(2) for i, month in enumerate(MONTH_NAMES)}


def convert_date(date):
    """Convert a date matching ``DATE_PATTERN`` to ``YYYY-MM-DD``.

    The input is split on whitespace into weekday, day ("5."), month name and
    year; the weekday is ignored.

    Raises:
        KeyError: if the month name is not one of ``MONTH_NAMES``.
        IndexError: if the input has fewer than four components.
    """
    components = date.split()
    day = components[1].rstrip('.').zfill(2)
    month = MONTHS[components[2]]
    year = components[3]
    return f'{year}-{month}-{day}'


def minify_whitespace(soup):
    """Collapse every run of whitespace in the text nodes of *soup* in place.

    Nodes that have a ``contents`` attribute are descended into recursively;
    every other node is replaced by its text with each whitespace run turned
    into a single space.
    """
    if hasattr(soup, 'contents'):
        # Iterate over a snapshot: replace_with() edits the parent's list.
        for child in list(soup.contents):
            minify_whitespace(child)
    else:
        soup.replace_with(sub(r'\s+', ' ', soup))


def _yaml_scalar(value):
    """Return *value* dumped as a single YAML scalar usable after ``key: ``.

    ``dump`` terminates its output with a newline and, for a bare top-level
    scalar, may also append a document-end marker ("...") depending on the
    PyYAML/libyaml version.  Both are removed because either would corrupt
    the surrounding front matter.
    """
    return dump(value, Dumper=CDumper).removesuffix('\n').removesuffix('\n...')


def download_gallery(path):
    """Download one gallery page and store it as ``<slug>/index.md``.

    Args:
        path: site-relative URL path of the gallery; the part after
            ``GALLERIES_PATH/`` becomes the slug and the directory name.

    The date written to the front matter is the end date when the page shows
    a date range and the single date otherwise.

    Raises:
        requests.RequestException: if the page cannot be fetched.
        ValueError: if the page contains no date or no ``#page-title``.
    """
    slug = path.removeprefix(f'{GALLERIES_PATH}/')

    r = requests.get(f'{BASE_URL}{path}', timeout=REQUEST_TIMEOUT)
    r.raise_for_status()

    # The date is searched in the raw HTML, before whitespace is minified.
    match = search(DATE_RANGE_PATTERN, r.text)
    if match is None:
        raise ValueError('no date found on the gallery page')
    date = convert_date(match[2] or match[1])

    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)

    heading = soup.find(id='page-title')
    if heading is None:
        raise ValueError("no element with id 'page-title' on the gallery page")
    # get_text() also works when the heading contains nested markup, where
    # ``.string`` would be None and the title would end up as "None".
    title = heading.get_text()

    md = MarkdownConverter()
    content = '\n\n'.join(
        md.convert_soup(p)
        for p in soup.select(
            '.field-type-text-with-summary > .field-items > .field-item > p'
        )
    )

    front_matter = '\n'.join((
        '---',
        f'slug: "{slug}"',
        f'title: {_yaml_scalar(title)}',
        f'date: {date}',
        '---',
    ))

    # Create the directory only now so a failed download leaves nothing behind.
    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)
    index = target / 'index.md'
    # Explicit encoding: titles and text regularly contain umlauts.
    index.write_text(f'{front_matter}\n\n{content}\n', encoding='utf-8')


def main():
    """Download every gallery linked from the overview page.

    A failure while processing one gallery is reported on stderr and does not
    stop the remaining galleries from being downloaded.
    """
    r = requests.get(GALLERIES_URL, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    for a in soup.select('.views-field-field-picture > .field-content > a'):
        href = a.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            # Top-level boundary: report the problem and go on with the next.
            print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)


if __name__ == '__main__':
    main()