Extend script to download images as well
continuous-integration/drone/push Build is passing
parent 61f679eb2d
commit f5af0f8d85
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 
 from bs4 import BeautifulSoup
+from datetime import datetime, timezone
 from markdownify import MarkdownConverter
+from os import utime
 from pathlib import Path
 from re import search, sub
 from sys import stderr
@@ -33,6 +35,33 @@ def minify_whitespace(soup):
     else:
         soup.replace_with(sub(r'\s+', ' ', soup))
 
+def parse_http_datetime(dt):
+    return datetime.strptime(dt.replace('GMT', '+0000'), '%a, %d %b %Y %H:%M:%S %z')
+
+def download_image(url, target):
+    name = url.rsplit('/', 1)[-1]
+    image = target / name
+
+    if image.exists():
+        r = requests.head(url)
+        r.raise_for_status()
+
+        last_modified = parse_http_datetime(r.headers['last-modified'])
+        mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
+        if last_modified <= mtime:
+            return
+
+    r = requests.get(url)
+    r.raise_for_status()
+
+    image.write_bytes(r.content)
+
+    last_modified = parse_http_datetime(r.headers['last-modified'])
+    timestamp = last_modified.timestamp()
+    utime(image, times=(timestamp, timestamp))
+
+    print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)
+
 def download_gallery(path):
     slug = path.removeprefix(f'{GALLERIES_PATH}/')
 
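Note (not part of the commit): the new download_image helper re-downloads a file only when the server's Last-Modified header is newer than the local copy's mtime, and afterwards stamps the file with that timestamp so repeated runs stay idempotent. A minimal sketch of calling it from the script's namespace, with a made-up URL and target directory:

    from pathlib import Path

    # Hypothetical example values; the real script derives the target per gallery.
    target = Path('galleries/example')
    target.mkdir(parents=True, exist_ok=True)
    download_image('https://kontakt-bamberg.de/sites/default/files/example.jpg', target)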
@@ -65,6 +94,17 @@ date: {date}
 {content}
 ''')
 
+    picture = soup.find(class_='picture')
+    preview = search('https?://kontakt-bamberg.de[-./\w]+', picture.get('style'))
+    download_image(preview[0], target)
+
+    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
+        href = a.get('href')
+        try:
+            download_image(href, target)
+        except Exception as e:
+            print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)
+
 if __name__ == '__main__':
     r = requests.get(GALLERIES_URL)
     r.raise_for_status()
@@ -3,7 +3,6 @@ certifi==2022.12.7
 charset-normalizer==3.1.0
 idna==3.4
 markdownify==0.11.6
-pathlib==1.0.1
 PyYAML==6.0
 requests==2.28.2
 six==1.16.0
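Note (not part of the commit): pathlib has been part of the Python standard library since 3.4, so the PyPI backport pinned here was redundant; dropping it leaves the script's from pathlib import Path resolving to the stdlib module. A quick sanity check, runnable anywhere:

    import pathlib
    print(pathlib.__file__)  # points into the interpreter's standard library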