diff --git a/.gitignore b/.gitignore
index 2fb78d7..137b231 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .hugo_build.lock
+env/
 public/
 resources/_gen/
diff --git a/bin/download_galleries.py b/bin/download_galleries.py
new file mode 100755
index 0000000..500bd60
--- /dev/null
+++ b/bin/download_galleries.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""Scrape photo galleries from kontakt-bamberg.de into Hugo page bundles."""
+
+from bs4 import BeautifulSoup
+from markdownify import MarkdownConverter
+from pathlib import Path
+from re import search, sub
+from sys import stderr
+from yaml import CDumper, dump
+import requests
+
+BASE_URL = 'https://kontakt-bamberg.de'
+
+GALLERIES_PATH = '/galerien'
+GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'
+
+# Hugo content directory the gallery page bundles are written into.
+TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'
+
+# German date like "Samstag, 4. März 2023" (weekday, day, month, year).
+DATE_PATTERN = r'[DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4}'
+# Either a single date or a "<start> bis <end>" range.
+DATE_RANGE_PATTERN = f'({DATE_PATTERN})(?:| bis ({DATE_PATTERN}))'
+
+MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember')
+MONTHS = {month: str(i+1).zfill(2) for i, month in enumerate(MONTH_NAMES)}
+
+def convert_date(date):
+    """Convert a German date like "Samstag, 4. März 2023" to ISO "2023-03-04"."""
+    components = date.split()
+    return f'{components[3]}-{MONTHS[components[2]]}-{components[1].rstrip(".").zfill(2)}'
+
+def minify_whitespace(soup):
+    """Recursively collapse every run of whitespace in all text nodes to one space."""
+    if hasattr(soup, 'contents'):
+        # Iterate over a snapshot: replace_with() mutates the parent's
+        # .contents list, which would make bs4 skip siblings otherwise.
+        for child in list(soup.contents):
+            minify_whitespace(child)
+    else:
+        soup.replace_with(sub(r'\s+', ' ', soup))
+
+def download_gallery(path):
+    """Fetch one gallery page and write content/galerie/<slug>/index.md."""
+    slug = path.removeprefix(f'{GALLERIES_PATH}/')
+
+    target = TARGET_DIR / slug
+    target.mkdir(parents=True, exist_ok=True)
+
+    r = requests.get(f'{BASE_URL}{path}')
+    r.raise_for_status()
+
+    date = search(DATE_RANGE_PATTERN, r.text)
+    if date is None:
+        raise ValueError('no date found in gallery page')
+    # For a date range, use the end date; otherwise the single date.
+    date = convert_date(date[2] if date[2] else date[1])
+
+    soup = BeautifulSoup(r.text, 'html.parser')
+    minify_whitespace(soup)
+
+    title = str(soup.find(id='page-title').string)
+
+    md = MarkdownConverter()
+    content = '\n\n'.join(md.convert_soup(p) for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p'))
+
+    index = target / 'index.md'
+    # NOTE(review): yaml dump() of a bare scalar ends with a "..." document
+    # marker line -- confirm Hugo tolerates it inside the front matter.
+    index.write_text(f'''---
+slug: "{slug}"
+title: {dump(title, Dumper=CDumper)}
+date: {date}
+---
+
+{content}
+''')
+
+if __name__ == '__main__':
+    r = requests.get(GALLERIES_URL)
+    r.raise_for_status()
+
+    soup = BeautifulSoup(r.text, 'html.parser')
+    for a in soup.select('.views-field-field-picture > .field-content > a'):
+        href = a.get('href')
+        if not href:
+            continue
+        try:
+            download_gallery(href)
+        except Exception as e:
+            print(f"downloading gallery at '{href}' has failed: {e}", file=stderr)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ab2de36
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+beautifulsoup4==4.12.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+idna==3.4
+markdownify==0.11.6
+# NOTE(review): "pathlib" is the obsolete Python-2 backport and is unneeded
+# on Python 3 (this script already requires 3.9+); consider removing.
+pathlib==1.0.1
+PyYAML==6.0
+requests==2.28.2
+six==1.16.0
+soupsieve==2.4
+urllib3==1.26.15