Improve website loader logging

This commit is contained in:
Sascha Ißbrücker 2023-01-14 11:24:09 +01:00
parent 313a0ee99f
commit 4f9170c48d
2 changed files with 16 additions and 2 deletions

View file

@ -4,6 +4,7 @@ from dataclasses import dataclass
import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_bytes
from django.utils import timezone
logger = logging.getLogger(__name__)
@ -26,12 +27,20 @@ def load_website_metadata(url: str):
title = None
description = None
try:
start = timezone.now()
page_text = load_page(url)
end = timezone.now()
logger.debug(f'Load duration: {end - start}')
start = timezone.now()
soup = BeautifulSoup(page_text, 'html.parser')
title = soup.title.string.strip() if soup.title is not None else None
description_tag = soup.find('meta', attrs={'name': 'description'})
description = description = description_tag['content'].strip() if description_tag and description_tag['content'] else None
description = description = description_tag['content'].strip() if description_tag and description_tag[
'content'] else None
end = timezone.now()
logger.debug(f'Parsing duration: {end - start}')
finally:
return WebsiteMetadata(url=url, title=title, description=description)
@ -44,15 +53,19 @@ def load_page(url: str):
headers = fake_request_headers()
size = 0
content = None
iteration = 0
# Use with to ensure request gets closed even if it's only read partially
with requests.get(url, timeout=10, headers=headers, stream=True) as r:
for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
size += len(chunk)
iteration = iteration + 1
if content is None:
content = chunk
else:
content = content + chunk
logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
# Stop reading if we have parsed end of head tag
if '</head>'.encode('utf-8') in content:
logger.debug(f'Found closing head tag after {size} bytes')
@ -61,6 +74,7 @@ def load_page(url: str):
if size > MAX_CONTENT_LIMIT:
logger.debug(f'Cancel reading document after {size} bytes')
break
logger.debug(f'Request consumed: {r._content_consumed}')
# Use charset_normalizer to determine encoding that best matches the response content
# Several sites seem to specify the response encoding incorrectly, so we ignore it and use custom logic instead

View file

@ -25,7 +25,7 @@ LOGGING = {
'disable_existing_loggers': False,
'formatters': {
'simple': {
'format': '{levelname} {message}',
'format': '{levelname} {asctime} {module}: {message}',
'style': '{',
},
},