www.kontakt-bamberg.de/bin/download_galleries.py

#!/usr/bin/env python3

from bs4 import BeautifulSoup
from datetime import datetime, timezone
from markdownify import MarkdownConverter
from os import utime
from pathlib import Path
from re import search, sub
from sys import stderr
from urllib.parse import unquote
from yaml import CDumper, dump
import requests

# Root of the website the galleries are scraped from.
BASE_URL = 'https://kontakt-bamberg.de'

# Path (relative to BASE_URL) of the gallery overview page.
GALLERIES_PATH = '/galerien'
GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'

# Output directory: <parent of this script's directory>/content/galerie.
TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'

# Long-form date such as "Donnerstag, 30. März 2023": a capitalised word
# starting with D/F/M/S, a comma, a one- or two-digit day followed by a
# literal dot, a word starting with one of A/D/F/J/M/N/O/S, and a four-digit
# year. The dot is escaped so that it no longer matches an arbitrary character.
DATE_PATTERN = r'([DFMS]\w+, \d{1,2}\. [ADFJMNOS]\w+ \d{4})'
# "bis", whitespace, then a "<...>" span (presumably the markup that sits
# between the two dates of a date range -- verify against the live pages).
DATE_RANGE_SEPARATOR_PATTERN = r'bis\s+<.+>'
# Captures the date that follows the range separator.
DATE_RANGE_PATTERN = f'{DATE_RANGE_SEPARATOR_PATTERN}{DATE_PATTERN}'

MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember')
# Maps each month name to its zero-padded two-digit number ('Januar' -> '01').
MONTHS = {month: f'{number:02d}' for number, month in enumerate(MONTH_NAMES, start=1)}

def convert_date(date):
    """Turn a whitespace-separated long-form date into ``YYYY-MM-DD``.

    The input is split on whitespace; the second field is the day (a
    trailing dot is dropped and the number is zero-padded to two digits),
    the third field is a month name looked up in ``MONTHS`` and the fourth
    field is the year. The first field is ignored.

    Raises:
        IndexError: if the input has fewer than four fields.
        KeyError: if the month name is not a key of ``MONTHS``.
    """
    fields = date.split()
    year = fields[3]
    month = MONTHS[fields[2]]
    day = fields[1].rstrip('.').zfill(2)
    return '-'.join((year, month, day))

def minify_whitespace(soup):
    """Collapse each run of whitespace in the tree's leaf nodes to one space.

    A node that exposes a ``contents`` attribute is treated as a container
    and its children are processed recursively. Any other node is treated as
    a string: it is replaced in place, through its ``replace_with`` method,
    by a copy in which every whitespace run has become a single space.
    """
    if not hasattr(soup, 'contents'):
        # Leaf node: swap it for its whitespace-collapsed text.
        collapsed = sub(r'\s+', ' ', soup)
        soup.replace_with(collapsed)
        return

    for node in soup.contents:
        minify_whitespace(node)

def parse_http_datetime(dt):
    """Parse an HTTP date string such as ``Thu, 30 Mar 2023 14:45:48 GMT``.

    The literal ``GMT`` is rewritten to the numeric offset ``+0000`` so that
    ``strptime`` can read it with ``%z``; the result is a timezone-aware
    ``datetime``.

    Raises:
        ValueError: if the string does not match the expected layout.
    """
    with_numeric_offset = dt.replace('GMT', '+0000')
    layout = '%a, %d %b %Y %H:%M:%S %z'
    return datetime.strptime(with_numeric_offset, layout)

def download_image(url, target):
    """Download the image at ``url`` into the directory ``target``.

    The local file name is the URL-unquoted last path segment of ``url``.
    If that file already exists, a HEAD request is sent first and the
    download is skipped when the server's ``Last-Modified`` time is not newer
    than the file's modification time. After a download the file's access
    and modification times are set to the server's ``Last-Modified`` time so
    that the comparison works on the next run.

    A response without a ``Last-Modified`` header is tolerated: an existing
    file is then downloaded again (its freshness cannot be judged) and the
    file times are left untouched.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            error status reported by ``raise_for_status``.
        ValueError: if a ``Last-Modified`` header is present but cannot be
            parsed by ``parse_http_datetime``.
    """
    name = unquote(url.rsplit('/', 1)[-1])
    image = target / name
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would block the whole run.
    timeout = 30

    if image.exists():
        r = requests.head(url, timeout=timeout)
        r.raise_for_status()

        header = r.headers.get('last-modified')
        if header is not None:
            last_modified = parse_http_datetime(header)
            mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
            if last_modified <= mtime:
                return

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    image.write_bytes(r.content)

    header = r.headers.get('last-modified')
    if header is None:
        # The file is already on disk; a missing header must not turn a
        # successful download into a reported failure.
        print(f"downloaded image '{name}' (last modified: unknown)", file=stderr)
        return

    last_modified = parse_http_datetime(header)
    timestamp = last_modified.timestamp()
    # Mirror the server's timestamp locally for the freshness check above.
    utime(image, times=(timestamp, timestamp))

    print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)

def download_gallery(path):
    """Download one gallery page and its images into ``TARGET_DIR``.

    ``path`` is the site-relative path of the gallery page. The gallery's
    slug is that path with the ``GALLERIES_PATH`` prefix removed and URL
    quoting undone; the output directory is ``TARGET_DIR / slug``.

    The function fetches the page, extracts the date (the date following a
    range separator if one is found, otherwise the first date on the page),
    the title, the body paragraphs (converted to Markdown) and the preview
    image URL, writes ``index.md`` with YAML front matter, and then downloads
    the preview image and every linked picture. A failure to download an
    individual picture is reported on stderr and does not stop the others.

    Raises:
        requests.RequestException: if the page or the preview image cannot
            be fetched.
        ValueError: if the page has no recognisable date, title or preview
            image.
    """
    slug = unquote(path.removeprefix(f'{GALLERIES_PATH}/'))

    target = TARGET_DIR / slug
    target.mkdir(parents=True, exist_ok=True)

    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection.
    r = requests.get(f'{BASE_URL}{path}', timeout=30)
    r.raise_for_status()

    date = search(DATE_RANGE_PATTERN, r.text)
    if date is None:
        date = search(DATE_PATTERN, r.text)
    if date is None:
        raise ValueError(f"no date found on gallery page '{path}'")
    date = convert_date(date[1])

    soup = BeautifulSoup(r.text, 'html.parser')
    minify_whitespace(soup)

    heading = soup.find(id='page-title')
    if heading is None:
        raise ValueError(f"no title found on gallery page '{path}'")
    title = heading.string
    if title is None:
        # .string is None when the heading has more than one child; fall
        # back to the concatenated text instead of the literal "None".
        title = heading.get_text()
    title = str(title)

    md = MarkdownConverter()
    content = '\n\n'.join(md.convert_soup(p) for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p'))

    # The preview image URL is embedded in the inline style of the element
    # with class "picture".
    picture = soup.find(class_='picture')
    style = picture.get('style') if picture is not None else None
    preview = search(r'https?://kontakt-bamberg.de[-./\w]+', style) if style else None
    if preview is None:
        raise ValueError(f"no preview image found on gallery page '{path}'")
    preview_url = preview[0]
    preview_name = unquote(preview_url.rsplit('/', 1)[-1])

    index = target / 'index.md'
    # Explicit UTF-8 so that umlauts are written identically on every platform.
    index.write_text(f'''---
slug: "{slug}"
title: {dump(title, Dumper=CDumper).rstrip()}
date: {date}
preview: "{preview_name}"
---

{content}
''', encoding='utf-8')

    download_image(preview_url, target)

    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
        href = a.get('href')
        try:
            download_image(href, target)
        except Exception as e:
            # Best effort: one broken picture must not abort the gallery.
            print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)

def main():
    """Download every gallery linked from the gallery overview page.

    Fetches ``GALLERIES_URL``, collects the gallery links from it and calls
    ``download_gallery`` for each one. A failure in one gallery is reported
    on stderr and the remaining galleries are still processed.

    Raises:
        requests.RequestException: if the overview page itself cannot be
            fetched.
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection.
    r = requests.get(GALLERIES_URL, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')
    for a in soup.select('.views-field-field-picture > .field-content > a'):
        href = a.get('href')
        try:
            download_gallery(href)
        except Exception as e:
            # Top-level boundary: report and carry on with the next gallery.
            print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)


if __name__ == '__main__':
    main()
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`#!/usr/bin/env python3`

			`from bs4 import BeautifulSoup`
Extend script to download images as well 2023-03-30 16:45:48 +02:00			`from datetime import datetime, timezone`
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`from markdownify import MarkdownConverter`
Extend script to download images as well 2023-03-30 16:45:48 +02:00			`from os import utime`
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`from pathlib import Path`
			`from re import search, sub`
			`from sys import stderr`
Unquote names from urls 2023-03-30 17:03:50 +02:00			`from urllib.parse import unquote`
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`from yaml import CDumper, dump`
			`import requests`

			`BASE_URL = 'https://kontakt-bamberg.de'`

			`GALLERIES_PATH = '/galerien'`
			`GALLERIES_URL = f'{BASE_URL}{GALLERIES_PATH}'`

			`TARGET_DIR = Path(__file__).resolve().parent.parent / 'content' / 'galerie'`

Fix date range detection pattern 2023-03-30 01:19:28 +02:00			`DATE_PATTERN = r'([DFMS]\w+, \d{1,2}. [ADFJMNOS]\w+ \d{4})'`
			`DATE_RANGE_SEPARATOR_PATTERN = r'bis\s+<.+>'`
			`DATE_RANGE_PATTERN = f'{DATE_RANGE_SEPARATOR_PATTERN}{DATE_PATTERN}'`
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00
			`MONTH_NAMES = ('Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember')`
			`MONTHS = {month: str(i+1).zfill(2) for i, month in enumerate(MONTH_NAMES)}`

			`def convert_date(date):`
			`components = date.split()`
			`return f'{components[3]}-{MONTHS[components[2]]}-{components[1].rstrip(".").zfill(2)}'`

			`def minify_whitespace(soup):`
			`if hasattr(soup, 'contents'):`
			`for child in soup.contents:`
			`minify_whitespace(child)`
			`else:`
			`soup.replace_with(sub(r'\s+', ' ', soup))`

Extend script to download images as well 2023-03-30 16:45:48 +02:00			`def parse_http_datetime(dt):`
			`return datetime.strptime(dt.replace('GMT', '+0000'), '%a, %d %b %Y %H:%M:%S %z')`

			`def download_image(url, target):`
Unquote names from urls 2023-03-30 17:03:50 +02:00			`name = unquote(url.rsplit('/', 1)[-1])`
Extend script to download images as well 2023-03-30 16:45:48 +02:00			`image = target / name`

			`if image.exists():`
			`r = requests.head(url)`
			`r.raise_for_status()`

			`last_modified = parse_http_datetime(r.headers['last-modified'])`
			`mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)`
			`if last_modified <= mtime:`
			`return`

			`r = requests.get(url)`
			`r.raise_for_status()`

			`image.write_bytes(r.content)`

			`last_modified = parse_http_datetime(r.headers['last-modified'])`
			`timestamp = last_modified.timestamp()`
			`utime(image, times=(timestamp, timestamp))`

			`print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)`

Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`def download_gallery(path):`
Unquote names from urls 2023-03-30 17:03:50 +02:00			`slug = unquote(path.removeprefix(f'{GALLERIES_PATH}/'))`
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00
			`target = TARGET_DIR / slug`
			`target.mkdir(parents=True, exist_ok=True)`

			`r = requests.get(f'{BASE_URL}{path}')`
			`r.raise_for_status()`

			`date = search(DATE_RANGE_PATTERN, r.text)`
Fix date range detection pattern 2023-03-30 01:19:28 +02:00			`if date is None:`
			`date = search(DATE_PATTERN, r.text)`
			`date = convert_date(date[1])`
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00
			`soup = BeautifulSoup(r.text, 'html.parser')`
			`minify_whitespace(soup)`

			`title = str(soup.find(id='page-title').string)`

			`md = MarkdownConverter()`
			`content = '\n\n'.join(md.convert_soup(p) for p in soup.select('.field-type-text-with-summary > .field-items > .field-item > p'))`

Use same gallery previews as current website 2023-04-01 16:36:54 +02:00			`picture = soup.find(class_='picture')`
			`preview = search('https?://kontakt-bamberg.de[-./\w]+', picture.get('style'))`

Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`index = target / 'index.md'`
			`index.write_text(f'''---`
			`slug: "{slug}"`
Strip trailing newlines from yaml-formatted title 2023-03-30 01:20:03 +02:00			`title: {dump(title, Dumper=CDumper).rstrip()}`
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`date: {date}`
Use same gallery previews as current website 2023-04-01 16:36:54 +02:00			`preview: "{unquote(preview[0].rsplit('/', 1)[-1])}"`
Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`---`

			`{content}`
			`''')`

Extend script to download images as well 2023-03-30 16:45:48 +02:00			`download_image(preview[0], target)`

			`for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):`
			`href = a.get('href')`
			`try:`
			`download_image(href, target)`
			`except Exception as e:`
			`print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)`

Add script to download galleries (only text content for now) from current kontakt website 2023-03-30 00:52:40 +02:00			`if __name__ == '__main__':`
			`r = requests.get(GALLERIES_URL)`
			`r.raise_for_status()`

			`soup = BeautifulSoup(r.text, 'html.parser')`
			`for a in soup.select('.views-field-field-picture > .field-content > a'):`
			`href = a.get('href')`
			`try:`
			`download_gallery(href)`
			`except Exception as e:`
			`print(f"downloading gallery at '{href}' has failed: {str(e)}", file=stderr)`