Extend script to download images as well

2023-03-30 16:45:48 +02:00 · 2023-03-30 16:45:48 +02:00 · f5af0f8d85
parent 61f679eb2d
commit f5af0f8d85
2 changed files with 40 additions and 1 deletion
--- a/bin/download_galleries.py
+++ b/bin/download_galleries.py
@ -1,7 +1,9 @@
 #!/usr/bin/env python3

 from bs4 import BeautifulSoup
+from datetime import datetime, timezone
 from markdownify import MarkdownConverter
+from os import utime
 from pathlib import Path
 from re import search, sub
 from sys import stderr
@ -33,6 +35,33 @@ def minify_whitespace(soup):
    else:
        soup.replace_with(sub(r'\s+', ' ', soup))

+def parse_http_datetime(dt):
+    """Turn an HTTP date string into a timezone-aware datetime.
+
+    `dt` is expected in the form 'Wed, 21 Oct 2015 07:28:00 GMT'.
+    """
+    # The %z directive parses a numeric UTC offset, not the literal 'GMT',
+    # so the zone name is swapped for its offset before parsing.
+    http_format = '%a, %d %b %Y %H:%M:%S %z'
+    with_offset = dt.replace('GMT', '+0000')
+    return datetime.strptime(with_offset, http_format)
+
+def download_image(url, target, timeout=30):
+    """Download the image at `url` into the directory `target`.
+
+    The file is named after the last path segment of `url`. If a file with
+    that name already exists, a HEAD request is made first and the download
+    is skipped when the server's Last-Modified time is not newer than the
+    file's mtime. After a download the file's atime and mtime are set to the
+    server's Last-Modified time so that the next run can compare against it.
+
+    `timeout` is the number of seconds handed to each request; without one,
+    requests waits indefinitely on an unresponsive server.
+
+    Raises requests.HTTPError (via raise_for_status) on a 4xx/5xx response;
+    other requests exceptions such as timeouts propagate unchanged.
+    """
+    name = url.rsplit('/', 1)[-1]
+    image = target / name
+
+    if image.exists():
+        # requests.head() does not follow redirects unless asked to; without
+        # this a 3xx answer would pass raise_for_status() and be inspected
+        # instead of the response for the actual image.
+        r = requests.head(url, allow_redirects=True, timeout=timeout)
+        r.raise_for_status()
+
+        # A server may omit Last-Modified; freshness cannot be checked then,
+        # so fall through and download again.
+        header = r.headers.get('last-modified')
+        if header is not None:
+            mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
+            if parse_http_datetime(header) <= mtime:
+                return
+
+    r = requests.get(url, timeout=timeout)
+    r.raise_for_status()
+
+    image.write_bytes(r.content)
+
+    header = r.headers.get('last-modified')
+    if header is None:
+        # Nothing to stamp the file with; keep the mtime from the write above
+        # instead of failing after the image has already been saved.
+        print(f"downloaded image '{name}' (last modified: unknown)", file=stderr)
+        return
+
+    last_modified = parse_http_datetime(header)
+    timestamp = last_modified.timestamp()
+    utime(image, times=(timestamp, timestamp))
+
+    print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)
+
 def download_gallery(path):
    slug = path.removeprefix(f'{GALLERIES_PATH}/')

@ -65,6 +94,17 @@ date: {date}
 {content}
 ''')

+    picture = soup.find(class_='picture')
+    preview = search('https?://kontakt-bamberg.de[-./\w]+', picture.get('style'))
+    download_image(preview[0], target)
+
+    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
+        href = a.get('href')
+        try:
+            download_image(href, target)
+        except Exception as e:
+            print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)
+
 if __name__ == '__main__':
    r = requests.get(GALLERIES_URL)
    r.raise_for_status()
--- a/requirements.txt
+++ b/requirements.txt
@ -3,7 +3,6 @@ certifi==2022.12.7
 charset-normalizer==3.1.0
 idna==3.4
 markdownify==0.11.6
-pathlib==1.0.1
 PyYAML==6.0
 requests==2.28.2
 six==1.16.0