# Reconstructed from a whitespace-collapsed git patch:
#   commit:  f5af0f8d8547b9658fb696ec310d83474ae740a8
#   author:  Luca
#   date:    Thu, 30 Mar 2023 16:45:48 +0200
#   subject: [PATCH] Extend script to download images as well
#
# The patch adds the definitions below to bin/download_galleries.py and
# removes the `pathlib==1.0.1` pin from requirements.txt.
#
# The third-party `requests` module is used here but imported by the target
# script itself; only the standard-library imports are repeated.

from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from os import utime
from re import search
from sys import stderr

# Matches an absolute image URL on the gallery host inside a CSS `style` value.
PREVIEW_URL_PATTERN = r'https?://kontakt-bamberg\.de[-./\w]+'

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 30

# Format used when reporting a Last-Modified value on stderr.
HTTP_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'


def parse_http_datetime(dt):
    """Parse an HTTP date header value into a timezone-aware UTC datetime.

    Args:
        dt: Header value such as ``'Thu, 30 Mar 2023 14:45:48 GMT'``.

    Returns:
        An aware ``datetime``; values without usable zone information are
        interpreted as UTC, which is what HTTP dates are defined to be.

    Raises:
        ValueError or TypeError: If ``dt`` is not a parseable date string.
    """
    # The stdlib parser is locale-independent, unlike strptime with %a/%b.
    parsed = parsedate_to_datetime(dt)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def download_image(url, target, timeout=REQUEST_TIMEOUT):
    """Download ``url`` into directory ``target`` unless the copy is current.

    An existing file is considered current when its modification time is not
    older than the server's ``Last-Modified`` header. After a download the
    file's access and modification times are set to that header value, so the
    comparison works on the next run.

    Args:
        url: Absolute URL of the image; the last path segment is the file name.
        target: ``pathlib.Path`` of the destination directory.
        timeout: Seconds to wait for each HTTP request.

    Raises:
        ValueError: If no file name can be derived from ``url``.
        requests.RequestException: On network errors or HTTP error statuses.
    """
    name = url.rsplit('/', 1)[-1]
    if not name:
        # An empty name would make `target / name` point at the directory.
        raise ValueError(f"cannot derive a file name from '{url}'")
    image = target / name

    if image.exists():
        # HEAD does not follow redirects by default; without this a 3xx
        # response would pass raise_for_status() and lack Last-Modified.
        r = requests.head(url, timeout=timeout, allow_redirects=True)
        r.raise_for_status()

        header = r.headers.get('last-modified')
        if header is not None:
            mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
            if parse_http_datetime(header) <= mtime:
                return
        # No Last-Modified header: freshness is unknown, so download again.

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    image.write_bytes(r.content)

    header = r.headers.get('last-modified')
    if header is None:
        print(f"downloaded image '{name}' (last modified: unknown)", file=stderr)
        return

    last_modified = parse_http_datetime(header)
    timestamp = last_modified.timestamp()
    utime(image, times=(timestamp, timestamp))

    print(
        f"downloaded image '{name}' "
        f"(last modified: {last_modified.strftime(HTTP_DATE_FORMAT)})",
        file=stderr,
    )


def extract_preview_url(style):
    """Return the first gallery-host URL found in ``style``, or ``None``.

    Args:
        style: Value of an element's ``style`` attribute; may be ``None``.
    """
    if not style:
        return None
    match = search(PREVIEW_URL_PATTERN, style)
    return match[0] if match else None


def download_gallery_images(soup, target):
    """Download a gallery page's preview image and all linked pictures.

    In the patch these statements are appended to the end of
    ``download_gallery(path)``, whose remaining body is not part of the
    visible hunk; they are grouped here so they form a complete definition.

    Args:
        soup: Parsed gallery page (BeautifulSoup object).
        target: ``pathlib.Path`` of the directory the images are written to.
    """
    picture = soup.find(class_='picture')
    style = picture.get('style') if picture is not None else None
    preview = extract_preview_url(style)
    if preview is None:
        print('no preview image found', file=stderr)
    else:
        download_image(preview, target)

    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
        href = a.get('href')
        try:
            download_image(href, target)
        except Exception as e:
            # Best effort per picture: report the failure and keep going.
            print(f"downloading image '{href}' has failed: {e}", file=stderr)