Extend script to download images as well
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Luca 2023-03-30 16:45:48 +02:00
parent 61f679eb2d
commit f5af0f8d85
2 changed files with 40 additions and 1 deletions

View File

@ -1,7 +1,9 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from datetime import datetime, timezone
from markdownify import MarkdownConverter from markdownify import MarkdownConverter
from os import utime
from pathlib import Path from pathlib import Path
from re import search, sub from re import search, sub
from sys import stderr from sys import stderr
@ -33,6 +35,33 @@ def minify_whitespace(soup):
else: else:
soup.replace_with(sub(r'\s+', ' ', soup)) soup.replace_with(sub(r'\s+', ' ', soup))
def parse_http_datetime(dt):
    """Parse an HTTP date header value into a timezone-aware datetime.

    Uses the standard library's RFC 5322 date parser rather than
    ``strptime``: ``%a``/``%b`` in ``strptime`` follow the process locale,
    so English day and month names fail to parse under a non-English
    ``LC_TIME``.

    Args:
        dt: Header value such as ``'Thu, 30 Mar 2023 14:45:48 GMT'``.

    Returns:
        An aware ``datetime``; a value without usable zone information is
        interpreted as UTC.

    Raises:
        ValueError: If the value cannot be parsed as a date (older Python
            versions may raise ``TypeError`` instead).
    """
    # Local import keeps this function self-contained within the module.
    from email.utils import parsedate_to_datetime

    parsed = parsedate_to_datetime(dt)
    if parsed.tzinfo is None:
        # The parser yields a naive datetime for a '-0000' zone; attach UTC
        # so callers can safely compare against other aware datetimes.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
def download_image(url, target):
    """Download the image at *url* into *target* unless the local copy is current.

    The file name is the last path segment of *url*. If that file already
    exists, a HEAD request is made and the download is skipped when the
    server's ``Last-Modified`` time is not newer than the file's mtime.
    After a download the file's access and modification times are set to
    the server's ``Last-Modified`` time so the next run can compare again.

    A missing ``Last-Modified`` header no longer raises ``KeyError``: on the
    freshness check the image is simply downloaded again, and after a
    download the file keeps the mtime assigned by the filesystem.

    Args:
        url: Absolute URL of the image.
        target: Directory to store the image in (presumably a
            ``pathlib.Path``; it must support ``/``).

    Raises:
        requests.HTTPError: If the HEAD or GET request returns an error
            status (via ``raise_for_status``).
    """
    name = url.rsplit('/', 1)[-1]
    image = target / name

    if image.exists():
        # HEAD fetches only the headers, so the freshness check is cheap.
        r = requests.head(url)
        r.raise_for_status()
        header = r.headers.get('last-modified')
        if header is not None:
            last_modified = parse_http_datetime(header)
            mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
            if last_modified <= mtime:
                return
        # Without the header freshness is unknown; fall through and re-download.

    r = requests.get(url)
    r.raise_for_status()
    image.write_bytes(r.content)

    header = r.headers.get('last-modified')
    if header is None:
        # Nothing to stamp the file with; leave the mtime set by the write.
        print(f"downloaded image '{name}' (last modified: unknown)", file=stderr)
        return

    last_modified = parse_http_datetime(header)
    timestamp = last_modified.timestamp()
    # Mirror the server's modification time locally for later comparisons.
    utime(image, times=(timestamp, timestamp))
    print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)
def download_gallery(path): def download_gallery(path):
slug = path.removeprefix(f'{GALLERIES_PATH}/') slug = path.removeprefix(f'{GALLERIES_PATH}/')
@ -65,6 +94,17 @@ date: {date}
{content} {content}
''') ''')
picture = soup.find(class_='picture')
preview = search('https?://kontakt-bamberg.de[-./\w]+', picture.get('style'))
download_image(preview[0], target)
for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
href = a.get('href')
try:
download_image(href, target)
except Exception as e:
print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)
if __name__ == '__main__': if __name__ == '__main__':
r = requests.get(GALLERIES_URL) r = requests.get(GALLERIES_URL)
r.raise_for_status() r.raise_for_status()

View File

@ -3,7 +3,6 @@ certifi==2022.12.7
charset-normalizer==3.1.0 charset-normalizer==3.1.0
idna==3.4 idna==3.4
markdownify==0.11.6 markdownify==0.11.6
pathlib==1.0.1
PyYAML==6.0 PyYAML==6.0
requests==2.28.2 requests==2.28.2
six==1.16.0 six==1.16.0