#!/usr/bin/env python3
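"""Scrape the photo galleries from kontakt-bamberg.de.

For each gallery under /galerien, this writes content/galerie/<slug>/index.md
with YAML front matter (slug, title, date, preview) and downloads the
gallery's images next to it, skipping files whose local copy is up to date.
"""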
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from markdownify import MarkdownConverter
from os import utime
from pathlib import Path
from re import search, sub
from sys import stderr
from urllib.parse import unquote
from yaml import CDumper, dump
import requests


BASE_URL = 'https://kontakt-bamberg.de'

GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'

# Assumes this script sits one level below the site root, next to content/.
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'

# German long-form dates such as 'Donnerstag, 30. März 2023'; the character
# classes cover the initials of German day and month names.
DATE_PATTERN = r'([DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4})'
# A date range ends with 'bis', an HTML tag, and the end date; only the end
# date is captured.
DATE_RANGE_SEPARATOR_PATTERN = r'bis\s+<.+>'
DATE_RANGE_PATTERN = f'{DATE_RANGE_SEPARATOR_PATTERN}{DATE_PATTERN}'

MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
               'August', 'September', 'Oktober', 'November', 'Dezember')
MONTHS = {month: str(i + 1).zfill(2) for i, month in enumerate(MONTH_NAMES)}


def convert_date(date):
    """Convert a German long-form date to ISO YYYY-MM-DD."""
    components = date.split()
    return f'{components[3]}-{MONTHS[components[2]]}-{components[1].rstrip(".").zfill(2)}'
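
# A worked example (date string as matched by DATE_PATTERN above):
#   convert_date('Donnerstag, 30. März 2023') == '2023-03-30'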


def minify_whitespace(soup):
    """Collapse every run of whitespace in the soup's text nodes to one space."""
    if hasattr(soup, 'contents'):
        for child in soup.contents:
            minify_whitespace(child)
    else:
        soup.replace_with(sub(r'\s+', ' ', soup))


def parse_http_datetime(dt):
    """Parse an HTTP date header into a timezone-aware datetime."""
    return datetime.strptime(dt.replace('GMT', '+0000'), '%a, %d %b %Y %H:%M:%S %z')
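
# A quick sanity check (hypothetical header value):
#   parse_http_datetime('Thu, 30 Mar 2023 14:45:48 GMT')
#   == datetime(2023, 3, 30, 14, 45, 48, tzinfo=timezone.utc)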


def download_image(url, target):
    """Download the image at url into target, unless the local copy is fresh."""
    name = unquote(url.rsplit('/', 1)[-1])
    image = target / name

    if image.exists():
        # Compare the server's Last-Modified with the local mtime and skip the
        # download when the local file is at least as new.
        r = requests.head(url)
        r.raise_for_status()

        last_modified = parse_http_datetime(r.headers['last-modified'])
        mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
        if last_modified <= mtime:
            return

    r = requests.get(url)
    r.raise_for_status()

    image.write_bytes(r.content)

    # Mirror the server-side modification time onto the local file so the
    # freshness check above keeps working across runs.
    last_modified = parse_http_datetime(r.headers['last-modified'])
    timestamp = last_modified.timestamp()
    utime(image, times=(timestamp, timestamp))

    print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)


def download_gallery(path):
    """Scrape a single gallery page into TARGET_DIR and fetch its images."""
    slug = unquote(path.removeprefix(f'{GALLERIES_PATH}/'))

    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)

    r = requests.get(f'{BASE_URL}{path}')
    r.raise_for_status()

    # Prefer the end date of a date range ('bis …'); otherwise take the single
    # date on the page.
    date = search(DATE_RANGE_PATTERN, r.text)
    if date is None:
        date = search(DATE_PATTERN, r.text)
    date = convert_date(date[1])

    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)

    title = str(soup.find(id='page-title').string)

    md = MarkdownConverter()
    content = '\n\n'.join(md.convert_soup(p) for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p'))

    # The preview image URL is embedded in the inline style of the teaser picture.
    picture = soup.find(class_='picture')
    preview = search(r'https?://kontakt-bamberg\.de[-./\w]+', picture.get('style'))

    index = target / 'index.md'
    index.write_text(f'''---
slug: "{slug}"
title: {dump(title, Dumper=CDumper).rstrip()}
date: {date}
preview: "{unquote(preview[0].rsplit('/', 1)[-1])}"
---

{content}
''')

    download_image(preview[0], target)

    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
        href = a.get('href')
        try:
            download_image(href, target)
        except Exception as e:
            print(f"downloading image '{href}' failed: {e}", file=stderr)


if __name__ == '__main__':
    r = requests.get(GALLERIES_URL)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')
    # Every teaser picture on the overview page links to its gallery page.
    for a in soup.select('.views-field-field-picture > .field-content > a'):
        href = a.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            print(f"downloading gallery at '{href}' failed: {e}", file=stderr)