# www.kontakt-bamberg.de/bin/download_galleries.py
#
# 122 lines
# 3.8 KiB
# Python
# Raw Normal View History

#!/usr/bin/env python3
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from markdownify import MarkdownConverter
from os import utime
from pathlib import Path
from re import search, sub
from sys import stderr
# 2023-03-30 17:03:50 +02:00
from urllib.parse import unquote
from yaml import CDumper, dump
import requests
# Root of the site being mirrored and the path of its gallery index page.
BASE_URL = 'https://kontakt-bamberg.de'
GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'

# Galleries are written to <parent of this script's directory>/content/galerie.
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'

# A written-out German date such as "Samstag, 1. April 2023": weekday name,
# comma, day of month followed by a literal dot, month name, four-digit year.
# The character classes are the initials of the German weekday/month names.
DATE_PATTERN = r'([DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4})'
# The word "bis" followed by some markup; the date right after it is captured
# by DATE_RANGE_PATTERN.
DATE_RANGE_SEPARATOR_PATTERN = r'bis\s+<.+>'
DATE_RANGE_PATTERN = f'{DATE_RANGE_SEPARATOR_PATTERN}{DATE_PATTERN}'

MONTH_NAMES = (
    'Januar', 'Februar', 'März', 'April', 'Mai', 'Juni',
    'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember',
)
# German month name -> zero-padded month number ('01' .. '12').
MONTHS = {month: f'{number:02d}' for number, month in enumerate(MONTH_NAMES, start=1)}
def convert_date(date):
    """Turn a written-out German date into ``YYYY-MM-DD`` form.

    ``date`` is split on whitespace: the second token is the day of month
    (a trailing dot is stripped and the number zero-padded to two digits),
    the third token is the month name looked up in ``MONTHS`` and the fourth
    token is the year. For example ``'Samstag, 1. April 2023'`` becomes
    ``'2023-04-01'``.
    """
    tokens = date.split()
    year = tokens[3]
    month = MONTHS[tokens[2]]
    day = tokens[1].rstrip('.').zfill(2)
    return '-'.join((year, month, day))
def minify_whitespace(soup):
    """Collapse whitespace runs to a single space throughout a parsed tree.

    Objects that expose a ``contents`` attribute are treated as containers
    and walked recursively. Any other object is treated as a text leaf and is
    replaced, via its own ``replace_with``, by a copy in which every run of
    whitespace has become one space. The tree is modified in place and
    nothing is returned.
    """
    if not hasattr(soup, 'contents'):
        # Text leaf: swap it for its whitespace-normalised form.
        soup.replace_with(sub(r'\s+', ' ', soup))
        return
    for node in soup.contents:
        minify_whitespace(node)
def parse_http_datetime(dt):
    """Parse an HTTP date string into a timezone-aware ``datetime``.

    The zone name ``GMT`` is rewritten to the numeric offset ``+0000`` so
    that the ``%z`` directive can parse it, e.g.
    ``'Thu, 30 Mar 2023 15:03:50 GMT'``.
    """
    http_format = '%a, %d %b %Y %H:%M:%S %z'
    with_numeric_offset = dt.replace('GMT', '+0000')
    return datetime.strptime(with_numeric_offset, http_format)
def download_image(url, target, timeout=30):
    """Download the image at ``url`` into the directory ``target``.

    The local file name is the percent-decoded last path segment of ``url``.
    If a file of that name already exists, a HEAD request is made first and
    the download is skipped when the server's ``Last-Modified`` time is not
    newer than the file's modification time. After a download, the file's
    access and modification times are set to the server's ``Last-Modified``
    time so that the same comparison works on the next run.

    Args:
        url: Absolute URL of the image.
        target: ``Path`` of an existing directory to store the image in.
        timeout: Seconds passed as ``timeout`` to each HTTP request, so that
            an unresponsive server cannot block the script forever.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    name = unquote(url.rsplit('/', 1)[-1])
    image = target / name

    if image.exists():
        # requests.head() does not follow redirects unless told to; without
        # this a redirected URL would answer 3xx with no Last-Modified header.
        head = requests.head(url, allow_redirects=True, timeout=timeout)
        head.raise_for_status()
        remote = head.headers.get('last-modified')
        if remote is not None:
            local_mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
            if parse_http_datetime(remote) <= local_mtime:
                return
        # Otherwise the server copy is newer, or its age is unknown because
        # no Last-Modified header was sent: fall through and download again.

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    image.write_bytes(response.content)

    remote = response.headers.get('last-modified')
    if remote is None:
        # The file is on disk already; without a server timestamp there is
        # nothing to stamp it with, so keep the mtime of the write.
        print(f"downloaded image '{name}' (last modified: unknown)", file=stderr)
        return

    last_modified = parse_http_datetime(remote)
    timestamp = last_modified.timestamp()
    utime(image, times=(timestamp, timestamp))
    print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)
def download_gallery(path, timeout=30):
    """Mirror one gallery page into its own directory below ``TARGET_DIR``.

    The page at ``BASE_URL + path`` is fetched and an ``index.md`` is written
    whose front matter holds the slug, title, date and preview image name,
    followed by the page's text paragraphs converted to Markdown. Then the
    preview image and every linked gallery picture are downloaded into the
    same directory.

    Args:
        path: Site-relative URL path of the gallery; the part after
            ``GALLERIES_PATH + '/'`` (percent-decoded) becomes the slug and
            directory name.
        timeout: Seconds passed as ``timeout`` to the request for the gallery
            page itself.

    Raises:
        requests.RequestException: If the gallery page cannot be fetched.
        ValueError: If no date or no preview image URL is found on the page.
        Exception: Whatever ``download_image`` raises for the preview image.
            Failures of individual gallery pictures are only reported on
            stderr.
    """
    slug = unquote(path.removeprefix(f'{GALLERIES_PATH}/'))
    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)

    r = requests.get(f'{BASE_URL}{path}', timeout=timeout)
    r.raise_for_status()

    # Prefer the date following "bis" (a date range); otherwise take the
    # first single date on the page.
    match = search(DATE_RANGE_PATTERN, r.text) or search(DATE_PATTERN, r.text)
    if match is None:
        raise ValueError(f"no date found on gallery page '{path}'")
    date = convert_date(match[1])

    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)
    title = str(soup.find(id='page-title').string)
    md = MarkdownConverter()
    content = '\n\n'.join(
        md.convert_soup(p)
        for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p')
    )

    # The preview image URL is embedded in the inline style of the element
    # with class "picture".
    picture = soup.find(class_='picture')
    style = picture.get('style') if picture is not None else None
    preview = search(r'https?://kontakt-bamberg\.de[-./\w]+', style) if style else None
    if preview is None:
        raise ValueError(f"no preview image found on gallery page '{path}'")
    preview_url = preview[0]
    preview_name = unquote(preview_url.rsplit('/', 1)[-1])

    # NOTE(review): the title is serialised through the YAML dumper so that
    # special characters get quoted; PyYAML may append a "..." document-end
    # line for plain scalars - verify the emitted front matter.
    index = target / 'index.md'
    index.write_text(
        '\n'.join((
            '---',
            f'slug: "{slug}"',
            f'title: {dump(title, Dumper=CDumper).rstrip()}',
            f'date: {date}',
            f'preview: "{preview_name}"',
            '---',
            content,
            '',
        )),
        # Explicit encoding: the text contains umlauts and must not depend on
        # the locale of the machine running the script.
        encoding='utf-8',
    )

    download_image(preview_url, target)
    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
        href = a.get('href')
        try:
            download_image(href, target)
        except Exception as e:
            # Best effort: one broken picture must not abort the whole gallery.
            print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)
def main(timeout=30):
    """Download every gallery linked from the gallery index page.

    Fetches ``GALLERIES_URL``, collects the gallery links and calls
    ``download_gallery`` for each of them. A failure of the index request
    propagates; a failure of a single gallery is reported on stderr and the
    remaining galleries are still processed.

    Args:
        timeout: Seconds passed as ``timeout`` to the index page request, so
            that an unresponsive server cannot block the script forever.
    """
    r = requests.get(GALLERIES_URL, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    for a in soup.select('.views-field-field-picture > .field-content > a'):
        href = a.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            # Best effort: keep going with the other galleries.
            print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)


if __name__ == '__main__':
    main()