From f5af0f8d8547b9658fb696ec310d83474ae740a8 Mon Sep 17 00:00:00 2001
From: Luca <Luca@hackerspace-bamberg.de>
Date: Thu, 30 Mar 2023 16:45:48 +0200
Subject: [PATCH] Extend script to download images as well

---
 bin/download_galleries.py | 40 +++++++++++++++++++++++++++++++++++++++
 requirements.txt          |  1 -
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/bin/download_galleries.py b/bin/download_galleries.py
index 9a2dfa2..97ee9e7 100755
--- a/bin/download_galleries.py
+++ b/bin/download_galleries.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 
 from bs4 import BeautifulSoup
+from datetime import datetime, timezone
 from markdownify import MarkdownConverter
+from os import utime
 from pathlib import Path
 from re import search, sub
 from sys import stderr
@@ -33,6 +35,69 @@ def minify_whitespace(soup):
     else:
         soup.replace_with(sub(r'\s+', ' ', soup))
 
+def parse_http_datetime(dt):
+    """Parse an HTTP-date header value into a timezone-aware datetime.
+
+    Delegates to the standard library's RFC 5322 date parser rather than
+    strptime(), whose %a/%b directives depend on the process locale while
+    HTTP dates always use English day and month names.
+
+    :param dt: header value such as 'Thu, 30 Mar 2023 14:45:48 GMT'
+    :returns: aware datetime (zero UTC offset for a 'GMT' date)
+    :raises ValueError: if dt cannot be parsed (TypeError before Python 3.10)
+    """
+    # Local import so the function does not rely on a module-level import.
+    from email.utils import parsedate_to_datetime
+
+    return parsedate_to_datetime(dt)
+
+def download_image(url, target, *, timeout=30):
+    """Download the image at url into the directory target.
+
+    The file name is the last path segment of url. When that file already
+    exists, a HEAD request is sent first and the download is skipped if the
+    server's Last-Modified is not newer than the local mtime. After a
+    download the local mtime is set to the server's Last-Modified so that
+    the next run can make this comparison.
+
+    :param url: URL of the image
+    :param target: directory to store the image in (joined with '/', so a
+        pathlib.Path is expected)
+    :param timeout: seconds passed to requests as the request timeout
+    :raises requests.HTTPError: from raise_for_status() on an error status
+    """
+    name = url.rsplit('/', 1)[-1]
+    image = target / name
+
+    if image.exists():
+        r = requests.head(url, timeout=timeout)
+        r.raise_for_status()
+
+        # A response may lack Last-Modified; the freshness is then unknown,
+        # so fall through and download again instead of raising KeyError.
+        remote = r.headers.get('last-modified')
+        if remote is not None:
+            mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
+            if parse_http_datetime(remote) <= mtime:
+                return
+
+    r = requests.get(url, timeout=timeout)
+    r.raise_for_status()
+
+    image.write_bytes(r.content)
+
+    remote = r.headers.get('last-modified')
+    if remote is None:
+        # No timestamp to sync to: leave the mtime at the time of writing.
+        print(f"downloaded image '{name}' (no last-modified header)", file=stderr)
+        return
+
+    last_modified = parse_http_datetime(remote)
+    timestamp = last_modified.timestamp()
+    utime(image, times=(timestamp, timestamp))
+
+    print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)
+
 def download_gallery(path):
     slug = path.removeprefix(f'{GALLERIES_PATH}/')
 
@@ -65,6 +94,21 @@ date: {date}
 {content}
 ''')
 
+    # The preview image URL is taken from the inline style of the element
+    # with class 'picture'. Raw string: '\w' is an invalid escape otherwise.
+    picture = soup.find(class_='picture')
+    preview = search(r'https?://kontakt-bamberg.de[-./\w]+', picture.get('style'))
+    download_image(preview[0], target)
+
+    # Gallery images are best effort: a failed download is reported on stderr
+    # and the remaining images are still attempted.
+    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
+        href = a.get('href')
+        try:
+            download_image(href, target)
+        except Exception as e:
+            print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)
+
 if __name__ == '__main__':
     r = requests.get(GALLERIES_URL)
     r.raise_for_status()
diff --git a/requirements.txt b/requirements.txt
index ab2de36..5625cf6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ certifi==2022.12.7
 charset-normalizer==3.1.0
 idna==3.4
 markdownify==0.11.6
-pathlib==1.0.1
 PyYAML==6.0
 requests==2.28.2
 six==1.16.0