Add script to download galleries (only text content for now) from current kontakt website
continuous-integration/drone/push Build is passing
Details
continuous-integration/drone/push Build is passing
Details
This commit is contained in:
parent
85a39501fb
commit
d96e125691
|
@ -1,3 +1,4 @@
|
||||||
.hugo_build.lock
|
.hugo_build.lock
|
||||||
|
env/
|
||||||
public/
|
public/
|
||||||
resources/_gen/
|
resources/_gen/
|
||||||
|
|
|
@ -0,0 +1,75 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from markdownify import MarkdownConverter
|
||||||
|
from pathlib import Path
|
||||||
|
from re import search, sub
|
||||||
|
from sys import stderr
|
||||||
|
from yaml import CDumper, dump
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Base URL of the live site the galleries are scraped from.
BASE_URL = 'https://kontakt-bamberg.de'

# Path and full URL of the gallery overview page.
GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'

# Hugo content directory the downloaded galleries are written into
# (repo-root/content/galerie, relative to this script's location).
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'

# German long date, e.g. 'Freitag, 3. März 2023'.  The character classes are
# the initials of the German weekday/month names, which keeps the pattern from
# matching arbitrary capitalised words.
# Fix: the dot after the day number is now escaped — unescaped it matched ANY
# character, so malformed strings like 'Freitag, 3x März 2023' were accepted.
DATE_PATTERN = r'[DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4}'

# A single date or a range '<date> bis <date>'; group 1 is the (start) date,
# group 2 — if present — the end of the range.
# Fix: the original '(?:| bis (...))' listed the EMPTY alternative first, and
# regex alternation is ordered, so the ' bis ...' branch could never match and
# group 2 was always None.  '(?: bis (...))?' is greedy and matches the range
# when one is present.
DATE_RANGE_PATTERN = f'({DATE_PATTERN})(?: bis ({DATE_PATTERN}))?'
|
||||||
|
|
||||||
|
MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember')

# Map each German month name to its zero-padded two-digit number ('Januar' -> '01').
MONTHS = {name: f'{number:02d}' for number, name in enumerate(MONTH_NAMES, start=1)}


def convert_date(date):
    """Convert a German long date such as 'Freitag, 3. März 2023' to ISO 'YYYY-MM-DD'."""
    parts = date.split()
    # parts: [weekday+',', 'D.', month-name, 'YYYY']
    day = parts[1].rstrip('.').zfill(2)
    month = MONTHS[parts[2]]
    year = parts[3]
    return f'{year}-{month}-{day}'
|
||||||
|
|
||||||
|
def minify_whitespace(soup):
    """Recursively collapse every whitespace run in the tree's text nodes to one space.

    Container nodes (anything exposing ``contents``) are descended into; leaf
    text nodes are replaced in place with their whitespace-collapsed form.
    """
    if not hasattr(soup, 'contents'):
        # Leaf text node: squash all whitespace runs (incl. newlines) to a single space.
        soup.replace_with(sub(r'\s+', ' ', soup))
        return
    for child in soup.contents:
        minify_whitespace(child)
|
||||||
|
|
||||||
|
def download_gallery(path):
    """Download one gallery page and write its text content as a Hugo page bundle.

    path: site-relative gallery path, e.g. '/galerien/<slug>'.

    Creates TARGET_DIR/<slug>/index.md containing YAML front matter (slug,
    title, date) followed by the gallery body converted to Markdown.

    Raises requests.HTTPError when the download fails and ValueError when no
    event date can be found on the page.
    """
    slug = path.removeprefix(f'{GALLERIES_PATH}/')

    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)

    r = requests.get(f'{BASE_URL}{path}')
    r.raise_for_status()

    # Prefer the end of a date range ('... bis <date>'); fall back to the
    # single/start date.  Fix: the original subscripted the search result
    # without a None check, so a page without a recognizable date died with an
    # opaque "'NoneType' object is not subscriptable" — raise a clear error
    # instead (the caller's per-gallery except reports it and continues).
    match = search(DATE_RANGE_PATTERN, r.text)
    if match is None:
        raise ValueError(f"no event date found on page '{path}'")
    date = convert_date(match[2] if match[2] else match[1])

    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)

    title = str(soup.find(id='page-title').string)

    # Convert each body paragraph of the Drupal text field to Markdown.
    md = MarkdownConverter()
    content = '\n\n'.join(md.convert_soup(p) for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p'))

    # Note: dump() already emits a trailing newline after the title value.
    index = target / 'index.md'
    index.write_text(f'''---
slug: "{slug}"
title: {dump(title, Dumper=CDumper)}
date: {date}
---

{content}
''')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Fetch the gallery overview page and scrape every gallery linked from it.
    response = requests.get(GALLERIES_URL)
    response.raise_for_status()

    overview = BeautifulSoup(response.text, 'html.parser')
    for link in overview.select('.views-field-field-picture > .field-content > a'):
        href = link.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            # Best effort: report the failure and keep going with the rest.
            print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)
|
|
@ -0,0 +1,11 @@
|
||||||
|
beautifulsoup4==4.12.0
|
||||||
|
certifi==2022.12.7
|
||||||
|
charset-normalizer==3.1.0
|
||||||
|
idna==3.4
|
||||||
|
markdownify==0.11.6
|
||||||
|
# NOTE(review): 'pathlib' on PyPI is an unmaintained backport of the stdlib
# module (built into Python since 3.4) and can shadow it — drop this pin.
pathlib==1.0.1
|
||||||
|
PyYAML==6.0
|
||||||
|
requests==2.28.2
|
||||||
|
six==1.16.0
|
||||||
|
soupsieve==2.4
|
||||||
|
urllib3==1.26.15
|
Loading…
Reference in New Issue