Add script to download galleries (only text content for now) from current kontakt website
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Luca 2023-03-30 00:52:40 +02:00
parent 85a39501fb
commit d96e125691
3 changed files with 87 additions and 0 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
.hugo_build.lock
env/
public/
resources/_gen/

75
bin/download_galleries.py Executable file
View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from pathlib import Path
from re import search, sub
from sys import stderr
from yaml import CDumper, dump
import requests
BASE_URL = 'https://kontakt-bamberg.de'
GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'
# Galleries are written as Hugo page bundles under content/galerie/<slug>/.
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'
# German long date, e.g. "Donnerstag, 30. März 2023".  [DFMS] / [ADFJMNOS]
# are the possible first letters of German weekday / month names.
# FIX: the dot after the day number is now escaped — the original '.'
# matched any character.
DATE_PATTERN = r'[DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4}'
# A single date or a range "<start> bis <end>"; group 1 = start, group 2 = end.
# FIX: the original '(?:| bis (...))' tried the empty alternative first and,
# with nothing following the group, always succeeded empty — group 2 could
# never capture the end date.  A greedy '(?:...)?' captures it when present.
DATE_RANGE_PATTERN = f'({DATE_PATTERN})(?: bis ({DATE_PATTERN}))?'
MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember')
# German month name -> zero-padded month number ('Januar' -> '01').
MONTHS = {month: str(i+1).zfill(2) for i, month in enumerate(MONTH_NAMES)}


def convert_date(date):
    """Convert a German long date to ISO 8601.

    "Donnerstag, 30. März 2023" -> "2023-03-30".
    Raises ValueError/KeyError if *date* does not have the expected shape.
    """
    _weekday, day, month, year = date.split()
    return f'{year}-{MONTHS[month]}-{day.rstrip(".").zfill(2)}'
def minify_whitespace(soup):
    """Recursively collapse runs of whitespace in every text node of *soup*.

    Tags (anything with a ``contents`` list) are descended into; leaf text
    nodes are replaced with a copy whose whitespace runs are collapsed to a
    single space.
    """
    if hasattr(soup, 'contents'):
        # FIX: iterate a snapshot — replace_with() in the recursive call
        # mutates the parent's .contents list while it is being iterated.
        for child in list(soup.contents):
            minify_whitespace(child)
    else:
        # Leaf node: a NavigableString (str-like), safe to pass to re.sub.
        soup.replace_with(sub(r'\s+', ' ', soup))
def download_gallery(path):
    """Download one gallery page and write it as a Hugo page bundle.

    *path* is the site-relative URL (e.g. '/galerien/<slug>'); the text
    content is written to TARGET_DIR/<slug>/index.md with YAML front matter.

    Raises requests.HTTPError on a failed download and ValueError when the
    page contains no recognizable date or title.
    """
    slug = path.removeprefix(f'{GALLERIES_PATH}/')
    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)
    r = requests.get(f'{BASE_URL}{path}')
    r.raise_for_status()
    # The date is scraped from the raw HTML with DATE_RANGE_PATTERN.
    # FIX: a page without a date previously raised an opaque
    # TypeError ('NoneType' is not subscriptable); fail with a clear message.
    match = search(DATE_RANGE_PATTERN, r.text)
    if match is None:
        raise ValueError(f"no date found on gallery page '{path}'")
    # Prefer the end date of a range ("... bis <date>"), else the single date.
    date = convert_date(match[2] or match[1])
    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)
    # FIX: guard against a missing title element for the same reason.
    title_tag = soup.find(id='page-title')
    if title_tag is None:
        raise ValueError(f"no title found on gallery page '{path}'")
    title = str(title_tag.string)
    md = MarkdownConverter()
    content = '\n\n'.join(
        md.convert_soup(p)
        for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p')
    )
    index = target / 'index.md'
    # Title goes through yaml.dump so quoting/escaping is always valid YAML.
    index.write_text(f'''---
slug: "{slug}"
title: {dump(title, Dumper=CDumper)}
date: {date}
---
{content}
''')
if __name__ == '__main__':
    # Fetch the gallery overview page and visit every linked gallery.
    response = requests.get(GALLERIES_URL)
    response.raise_for_status()
    overview = BeautifulSoup(response.text, 'html.parser')
    gallery_links = overview.select('.views-field-field-picture > .field-content > a')
    for link in gallery_links:
        href = link.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            # Best effort: one broken gallery must not abort the whole run.
            print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)

11
requirements.txt Normal file
View File

@@ -0,0 +1,11 @@
beautifulsoup4==4.12.0
certifi==2022.12.7
charset-normalizer==3.1.0
idna==3.4
markdownify==0.11.6
pathlib==1.0.1
PyYAML==6.0
requests==2.28.2
six==1.16.0
soupsieve==2.4
urllib3==1.26.15