#!/usr/bin/env python3
"""Mirror photo galleries from kontakt-bamberg.de into the local content tree.

For every gallery linked from the site's overview page this script creates
``content/galerie/<slug>/`` containing an ``index.md`` (YAML front matter plus
the gallery description converted to Markdown) and all gallery images.
Images are re-downloaded only when the server reports a newer
``Last-Modified`` timestamp than the local file's mtime.
"""
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from markdownify import MarkdownConverter
from os import utime
from pathlib import Path
from re import search, sub
from sys import stderr
from urllib.parse import unquote
from yaml import CDumper, dump
import requests

BASE_URL = 'https://kontakt-bamberg.de'
GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'
# The script lives one directory below the repository root.
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'
# Matches German long dates such as "Samstag, 3. Oktober 2020"
# (weekday, day-of-month with trailing dot, month name, four-digit year).
DATE_PATTERN = r'([DFMS]\w+, \d{1,2}. [ADFJMNOS]\w+ \d{4})'
DATE_RANGE_SEPARATOR_PATTERN = r'bis\s+<.+>'
# Captures the *end* date of a "<start> bis <end>" date range.
DATE_RANGE_PATTERN = f'{DATE_RANGE_SEPARATOR_PATTERN}{DATE_PATTERN}'
MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
               'August', 'September', 'Oktober', 'November', 'Dezember')
# German month name -> zero-padded month number ('01' .. '12').
MONTHS = {month: str(i + 1).zfill(2) for i, month in enumerate(MONTH_NAMES)}
# requests has NO default timeout; without one a stalled server hangs forever.
REQUEST_TIMEOUT = 30


def convert_date(date):
    """Convert a German date like 'Samstag, 3. Oktober 2020' to ISO '2020-10-03'."""
    components = date.split()
    return f'{components[3]}-{MONTHS[components[2]]}-{components[1].rstrip(".").zfill(2)}'


def minify_whitespace(soup):
    """Recursively collapse every run of whitespace in *soup* to a single space.

    Tags are recursed into; text nodes (anything without ``contents``) are
    replaced in place with their whitespace-collapsed form.
    """
    if hasattr(soup, 'contents'):
        for child in soup.contents:
            minify_whitespace(child)
    else:
        soup.replace_with(sub(r'\s+', ' ', soup))


def parse_http_datetime(dt):
    """Parse an RFC 7231 HTTP date (e.g. a Last-Modified header) into an aware datetime.

    'GMT' is rewritten to '+0000' because %Z cannot produce an aware result.
    """
    return datetime.strptime(dt.replace('GMT', '+0000'), '%a, %d %b %Y %H:%M:%S %z')


def download_image(url, target):
    """Download *url* into directory *target*, skipping up-to-date local copies.

    If the file already exists, a HEAD request compares the server's
    Last-Modified timestamp against the local mtime and returns early when the
    local copy is current.  After a download the file's mtime is set to the
    server's Last-Modified time so the comparison works on the next run.

    Raises:
        requests.HTTPError: on a non-2xx response.
        KeyError: if the response lacks a Last-Modified header.
    """
    name = unquote(url.rsplit('/', 1)[-1])
    image = target / name
    if image.exists():
        r = requests.head(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        last_modified = parse_http_datetime(r.headers['last-modified'])
        mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
        if last_modified <= mtime:
            return  # local copy is current; nothing to do
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    image.write_bytes(r.content)
    last_modified = parse_http_datetime(r.headers['last-modified'])
    timestamp = last_modified.timestamp()
    # Stamp the file with the server's Last-Modified so future runs can skip it.
    utime(image, times=(timestamp, timestamp))
    print(f"downloaded image '{name}' (last modified: "
          f"{last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)


def download_gallery(path):
    """Fetch one gallery page at *path* and write its index.md plus all images.

    Creates ``TARGET_DIR/<slug>/`` (slug = URL path minus the galleries
    prefix), extracts the gallery's (end) date, title and description, writes
    the front-matter file, then downloads the preview image and every linked
    picture.  Per-image failures are logged to stderr and do not abort the
    gallery.

    Raises:
        requests.HTTPError: if the gallery page itself cannot be fetched.
        ValueError: if no date can be found on the page.
    """
    slug = unquote(path.removeprefix(f'{GALLERIES_PATH}/'))
    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)
    r = requests.get(f'{BASE_URL}{path}', timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    # Prefer the end date of a date range; fall back to a single date.
    date = search(DATE_RANGE_PATTERN, r.text)
    if date is None:
        date = search(DATE_PATTERN, r.text)
    if date is None:
        # Fail with a clear message instead of 'NoneType is not subscriptable'.
        raise ValueError(f"no date found on gallery page '{path}'")
    date = convert_date(date[1])
    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)
    title = str(soup.find(id='page-title').string)
    md = MarkdownConverter()
    content = '\n\n'.join(md.convert_soup(p) for p in soup.select(
        '.field-type-text-with-summary > .field-items > .field-item > p'))
    index = target / 'index.md'
    # YAML-dump the title so quoting/escaping of arbitrary text is correct.
    index.write_text(f'''---
slug: "{slug}"
title: {dump(title, Dumper=CDumper).rstrip()}
date: {date}
---

{content}
''')
    picture = soup.find(class_='picture')
    # Raw string: '\w' in a plain literal is an invalid escape (SyntaxWarning
    # since Python 3.12); the hostname dot is escaped to match literally.
    preview = search(r'https?://kontakt-bamberg\.de[-./\w]+', picture.get('style'))
    download_image(preview[0], target)
    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
        href = a.get('href')
        try:
            download_image(href, target)
        except Exception as e:
            print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)


if __name__ == '__main__':
    # Scrape the overview page and mirror every linked gallery; a failing
    # gallery is logged and skipped so one bad page cannot abort the run.
    r = requests.get(GALLERIES_URL, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    for a in soup.select('.views-field-field-picture > .field-content > a'):
        href = a.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)