Add script to download galleries (only text content for now) from current kontakt website
continuous-integration/drone/push Build is passing
Details
continuous-integration/drone/push Build is passing
Details
This commit is contained in:
parent
85a39501fb
commit
d96e125691
|
@ -1,3 +1,4 @@
|
|||
.hugo_build.lock
env/
public/
resources/_gen/
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from markdownify import MarkdownConverter
|
||||
from pathlib import Path
|
||||
from re import search, sub
|
||||
from sys import stderr
|
||||
from yaml import CDumper, dump
|
||||
import requests
|
||||
|
||||
BASE_URL = 'https://kontakt-bamberg.de'

GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'

# Galleries are written as page bundles below content/galerie/<slug>/.
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'

# Matches a German long-form date such as 'Samstag, 1. Januar 2023'.
# The period after the day number is escaped so it only matches a literal
# '.' (an unescaped '.' would match any character there).
DATE_PATTERN = r'[DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4}'
# A single date (group 1), optionally followed by ' bis <end date>' (group 2).
# The optional group must be written as '(?: bis ...)?' rather than
# '(?:| bis ...)': regex alternation is ordered, so an empty FIRST
# alternative always wins and the end date would never be captured.
DATE_RANGE_PATTERN = f'({DATE_PATTERN})(?: bis ({DATE_PATTERN}))?'
|
||||
MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember')
# German month name -> zero-padded two-digit month number ('Januar' -> '01').
MONTHS = {name: f'{number:02}' for number, name in enumerate(MONTH_NAMES, start=1)}


def convert_date(date):
    """Convert a German long-form date ('Samstag, 1. Januar 2023') to ISO 'YYYY-MM-DD'."""
    parts = date.split()
    # parts: [weekday + comma, day + '.', month name, four-digit year]
    day = parts[1].rstrip('.').zfill(2)
    return f'{parts[3]}-{MONTHS[parts[2]]}-{day}'
|
||||
|
||||
def minify_whitespace(node):
    """Recursively collapse every run of whitespace in the parse tree to a single space.

    Tag nodes (anything with a ``contents`` attribute) are descended into;
    leaf text nodes are replaced in place with their whitespace-collapsed form.
    """
    if not hasattr(node, 'contents'):
        # Leaf text node: NavigableString is a str subclass, so it can be
        # fed to re.sub directly and swapped out via replace_with().
        node.replace_with(sub(r'\s+', ' ', node))
        return
    for child in node.contents:
        minify_whitespace(child)
|
||||
|
||||
def download_gallery(path):
    """Download one gallery page and write it as a markdown page bundle.

    path: site-relative URL of the gallery (e.g. '/galerien/some-slug').
    Creates TARGET_DIR/<slug>/index.md with YAML front matter (slug, title,
    date) followed by the gallery's paragraph text converted to markdown.

    Raises requests.HTTPError on a failed download and ValueError when no
    date can be found on the page; the caller logs and continues.
    """
    slug = path.removeprefix(f'{GALLERIES_PATH}/')

    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)

    r = requests.get(f'{BASE_URL}{path}')
    r.raise_for_status()

    match = search(DATE_RANGE_PATTERN, r.text)
    if match is None:
        # Fail with a clear message instead of the opaque TypeError that
        # subscripting None would produce.
        raise ValueError(f"no date found on gallery page '{path}'")
    # For a date range prefer the end date (group 2); otherwise the single date.
    date = convert_date(match[2] if match[2] else match[1])

    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)

    title = str(soup.find(id='page-title').string)

    md = MarkdownConverter()
    content = '\n\n'.join(md.convert_soup(p) for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p'))

    index = target / 'index.md'
    # YAML-dump the title so quoting/escaping of arbitrary characters is safe.
    index.write_text(f'''---
slug: "{slug}"
title: {dump(title, Dumper=CDumper)}
date: {date}
---

{content}
''')
|
||||
|
||||
if __name__ == '__main__':
    # Fetch the gallery overview page and visit every gallery linked from it.
    response = requests.get(GALLERIES_URL)
    response.raise_for_status()

    overview = BeautifulSoup(response.text, 'html.parser')
    anchors = overview.select('.views-field-field-picture > .field-content > a')
    for anchor in anchors:
        href = anchor.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            # One broken gallery must not abort the whole run; report and move on.
            print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)
|
|
@ -0,0 +1,11 @@
|
|||
beautifulsoup4==4.12.0
certifi==2022.12.7
charset-normalizer==3.1.0
idna==3.4
markdownify==0.11.6
pathlib==1.0.1
PyYAML==6.0
requests==2.28.2
six==1.16.0
soupsieve==2.4
urllib3==1.26.15
|
Loading…
Reference in New Issue