Extend script to download images as well
continuous-integration/drone/push Build is passing
parent 61f679eb2d
commit f5af0f8d85
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 
 from bs4 import BeautifulSoup
+from datetime import datetime, timezone
 from markdownify import MarkdownConverter
+from os import utime
 from pathlib import Path
 from re import search, sub
 from sys import stderr
@@ -33,6 +35,33 @@ def minify_whitespace(soup):
     else:
         soup.replace_with(sub(r'\s+', ' ', soup))
 
+def parse_http_datetime(dt):
+    return datetime.strptime(dt.replace('GMT', '+0000'), '%a, %d %b %Y %H:%M:%S %z')
+
+def download_image(url, target):
+    name = url.rsplit('/', 1)[-1]
+    image = target / name
+
+    if image.exists():
+        r = requests.head(url)
+        r.raise_for_status()
+
+        last_modified = parse_http_datetime(r.headers['last-modified'])
+        mtime = datetime.fromtimestamp(image.stat().st_mtime, tz=timezone.utc)
+        if last_modified <= mtime:
+            return
+
+    r = requests.get(url)
+    r.raise_for_status()
+
+    image.write_bytes(r.content)
+
+    last_modified = parse_http_datetime(r.headers['last-modified'])
+    timestamp = last_modified.timestamp()
+    utime(image, times=(timestamp, timestamp))
+
+    print(f"downloaded image '{name}' (last modified: {last_modified.strftime('%a, %d %b %Y %H:%M:%S %Z')})", file=stderr)
+
 def download_gallery(path):
     slug = path.removeprefix(f'{GALLERIES_PATH}/')
 
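Note (not part of the commit): the new download_image helper re-downloads a file only when the server's Last-Modified header is newer than the local copy's mtime, and afterwards stamps the file with that timestamp so repeated runs stay idempotent. A minimal sketch of calling it from the script's namespace, with a made-up URL and target directory:

    from pathlib import Path

    # Hypothetical example values; the real script derives the target per gallery.
    target = Path('galleries/example')
    target.mkdir(parents=True, exist_ok=True)
    download_image('https://kontakt-bamberg.de/sites/default/files/example.jpg', target)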
@@ -65,6 +94,17 @@ date: {date}
 {content}
 ''')
 
+    picture = soup.find(class_='picture')
+    preview = search('https?://kontakt-bamberg.de[-./\w]+', picture.get('style'))
+    download_image(preview[0], target)
+
+    for a in soup.select('.field-name-field-pictures > .field-items > .field-item > a'):
+        href = a.get('href')
+        try:
+            download_image(href, target)
+        except Exception as e:
+            print(f"downloading image '{href}' has failed: {str(e)}", file=stderr)
+
 if __name__ == '__main__':
     r = requests.get(GALLERIES_URL)
     r.raise_for_status()
@@ -3,7 +3,6 @@ certifi==2022.12.7
 charset-normalizer==3.1.0
 idna==3.4
 markdownify==0.11.6
-pathlib==1.0.1
 PyYAML==6.0
 requests==2.28.2
 six==1.16.0
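Note (not part of the commit): pathlib has been part of the Python standard library since 3.4, so the PyPI backport pinned here was redundant; dropping it leaves the script's from pathlib import Path resolving to the stdlib module. A quick sanity check, runnable anywhere:

    import pathlib
    print(pathlib.__file__)  # points into the interpreter's standard library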