Add script to download galleries (only text content for now) from current kontakt website
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Luca 2023-03-30 00:52:40 +02:00
parent 85a39501fb
commit d96e125691
3 changed files with 87 additions and 0 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
.hugo_build.lock
env/
public/
resources/_gen/

75
bin/download_galleries.py Executable file
View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from pathlib import Path
from re import search, sub
from sys import stderr
from yaml import CDumper, dump
import requests
BASE_URL = 'https://kontakt-bamberg.de'
GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'
# Galleries are written as Hugo page bundles under content/galerie/<slug>/.
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'
# German long date, e.g. "Donnerstag, 30. März 2023".  [DFMS] / [ADFJMNOS]
# are the possible first letters of German weekday / month names.
# FIX: the dot after the day number is now escaped — the original '.'
# matched any character.
DATE_PATTERN = r'[DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4}'
# A single date or a range "<start> bis <end>"; group 1 = start, group 2 = end.
# FIX: the original '(?:| bis (...))' tried the empty alternative first and,
# with nothing following the group, always succeeded empty — group 2 could
# never capture the end date.  A greedy '(?:...)?' captures it when present.
DATE_RANGE_PATTERN = f'({DATE_PATTERN})(?: bis ({DATE_PATTERN}))?'
MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember')
# German month name -> zero-padded month number ('Januar' -> '01').
MONTHS = {month: str(i+1).zfill(2) for i, month in enumerate(MONTH_NAMES)}


def convert_date(date):
    """Convert a German long date to ISO 8601.

    "Donnerstag, 30. März 2023" -> "2023-03-30".
    Raises ValueError/KeyError if *date* does not have the expected shape.
    """
    _weekday, day, month, year = date.split()
    return f'{year}-{MONTHS[month]}-{day.rstrip(".").zfill(2)}'
def minify_whitespace(soup):
    """Recursively collapse runs of whitespace in every text node of *soup*.

    Tags (anything with a ``contents`` list) are descended into; leaf text
    nodes are replaced with a copy whose whitespace runs are collapsed to a
    single space.
    """
    if hasattr(soup, 'contents'):
        # FIX: iterate a snapshot — replace_with() in the recursive call
        # mutates the parent's .contents list while it is being iterated.
        for child in list(soup.contents):
            minify_whitespace(child)
    else:
        # Leaf node: a NavigableString (str-like), safe to pass to re.sub.
        soup.replace_with(sub(r'\s+', ' ', soup))
def download_gallery(path):
    """Download one gallery page and write it as a Hugo page bundle.

    *path* is the site-relative URL (e.g. '/galerien/<slug>'); the text
    content is written to TARGET_DIR/<slug>/index.md with YAML front matter.

    Raises requests.HTTPError on a failed download and ValueError when the
    page contains no recognizable date or title.
    """
    slug = path.removeprefix(f'{GALLERIES_PATH}/')
    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)
    r = requests.get(f'{BASE_URL}{path}')
    r.raise_for_status()
    # The date is scraped from the raw HTML with DATE_RANGE_PATTERN.
    # FIX: a page without a date previously raised an opaque
    # TypeError ('NoneType' is not subscriptable); fail with a clear message.
    match = search(DATE_RANGE_PATTERN, r.text)
    if match is None:
        raise ValueError(f"no date found on gallery page '{path}'")
    # Prefer the end date of a range ("... bis <date>"), else the single date.
    date = convert_date(match[2] or match[1])
    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)
    # FIX: guard against a missing title element for the same reason.
    title_tag = soup.find(id='page-title')
    if title_tag is None:
        raise ValueError(f"no title found on gallery page '{path}'")
    title = str(title_tag.string)
    md = MarkdownConverter()
    content = '\n\n'.join(
        md.convert_soup(p)
        for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p')
    )
    index = target / 'index.md'
    # Title goes through yaml.dump so quoting/escaping is always valid YAML.
    index.write_text(f'''---
slug: "{slug}"
title: {dump(title, Dumper=CDumper)}
date: {date}
---
{content}
''')
if __name__ == '__main__':
    # Fetch the gallery overview page and visit every linked gallery.
    response = requests.get(GALLERIES_URL)
    response.raise_for_status()
    overview = BeautifulSoup(response.text, 'html.parser')
    gallery_links = overview.select('.views-field-field-picture > .field-content > a')
    for link in gallery_links:
        href = link.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            # Best effort: one broken gallery must not abort the whole run.
            print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)

11
requirements.txt Normal file
View File

@@ -0,0 +1,11 @@
beautifulsoup4==4.12.0
certifi==2022.12.7
charset-normalizer==3.1.0
idna==3.4
markdownify==0.11.6
pathlib==1.0.1
PyYAML==6.0
requests==2.28.2
six==1.16.0
soupsieve==2.4
urllib3==1.26.15