Improve website loader logging

Sascha Ißbrücker 2023-01-14 11:24:09 +01:00
parent 313a0ee99f
commit 4f9170c48d
2 changed files with 16 additions and 2 deletions

@@ -4,6 +4,7 @@ from dataclasses import dataclass
 import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_bytes
+from django.utils import timezone

 logger = logging.getLogger(__name__)

@@ -26,12 +27,20 @@ def load_website_metadata(url: str):
     title = None
     description = None

     try:
+        start = timezone.now()
         page_text = load_page(url)
+        end = timezone.now()
+        logger.debug(f'Load duration: {end - start}')
+
+        start = timezone.now()
         soup = BeautifulSoup(page_text, 'html.parser')

         title = soup.title.string.strip() if soup.title is not None else None
         description_tag = soup.find('meta', attrs={'name': 'description'})
-        description = description_tag['content'].strip() if description_tag and description_tag['content'] else None
+        description = description_tag['content'].strip() if description_tag and description_tag[
+            'content'] else None
+        end = timezone.now()
+        logger.debug(f'Parsing duration: {end - start}')
     finally:
         return WebsiteMetadata(url=url, title=title, description=description)
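
A note on the timing pattern above: timezone.now() returns a timezone-aware datetime, so end - start is a plain datetime.timedelta and renders as e.g. 0:00:00.532105 in the debug messages. A minimal standalone sketch of the pattern, where the settings.configure() call and the sleep stand-in are only there to make it runnable outside a Django project:

import time

from django.conf import settings

settings.configure(USE_TZ=True)  # minimal config so timezone.now() works standalone

from django.utils import timezone

start = timezone.now()
time.sleep(0.1)  # stand-in for the timed work, e.g. load_page(url)
end = timezone.now()

# end - start is a datetime.timedelta, e.g. 0:00:00.100123
print(f'Load duration: {end - start}')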
@@ -44,15 +53,19 @@ def load_page(url: str):
     headers = fake_request_headers()
     size = 0
     content = None
+    iteration = 0
     # Use with to ensure request gets closed even if it's only read partially
     with requests.get(url, timeout=10, headers=headers, stream=True) as r:
         for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
             size += len(chunk)
+            iteration = iteration + 1
             if content is None:
                 content = chunk
             else:
                 content = content + chunk

+            logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
+
             # Stop reading if we have parsed end of head tag
             if '</head>'.encode('utf-8') in content:
                 logger.debug(f'Found closing head tag after {size} bytes')
@@ -61,6 +74,7 @@ def load_page(url: str):
             if size > MAX_CONTENT_LIMIT:
                 logger.debug(f'Cancel reading document after {size} bytes')
                 break
+        logger.debug(f'Request consumed: {r._content_consumed}')

     # Use charset_normalizer to determine encoding that best matches the response content
     # Several sites seem to specify the response encoding incorrectly, so we ignore it and use custom logic instead
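
For reference, the streaming logic above assembles into roughly the following standalone sketch. The CHUNK_SIZE and MAX_CONTENT_LIMIT values are assumptions, the fake_request_headers() helper is omitted, and r._content_consumed is a private requests attribute, logged here purely for debugging:

import logging

import requests

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

CHUNK_SIZE = 50 * 1024           # assumed chunk size
MAX_CONTENT_LIMIT = 5000 * 1024  # assumed upper bound


def load_page(url: str):
    size = 0
    content = None
    iteration = 0
    # Use with to ensure the request gets closed even if it's only read partially
    with requests.get(url, timeout=10, stream=True) as r:
        for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
            size += len(chunk)
            iteration = iteration + 1
            if content is None:
                content = chunk
            else:
                content = content + chunk

            logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')

            # Stop reading once the closing head tag arrives; title and
            # meta description live in the head, so the rest is not needed
            if b'</head>' in content:
                logger.debug(f'Found closing head tag after {size} bytes')
                break
            # Bail out on documents that never close their head tag
            if size > MAX_CONTENT_LIMIT:
                logger.debug(f'Cancel reading document after {size} bytes')
                break
        # True once the response body was fully read, False if reading stopped early
        logger.debug(f'Request consumed: {r._content_consumed}')
    return content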

@@ -25,7 +25,7 @@ LOGGING = {
     'disable_existing_loggers': False,
     'formatters': {
         'simple': {
-            'format': '{levelname} {message}',
+            'format': '{levelname} {asctime} {module}: {message}',
             'style': '{',
         },
     },
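
The formatter change above adds a timestamp and the originating module to each log line, keeping str.format-style ({) placeholders. A minimal stdlib sketch of the equivalent configuration and the output it produces:

import logging

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('{levelname} {asctime} {module}: {message}', style='{'))

logger = logging.getLogger('demo')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

logger.debug('Found closing head tag after 51200 bytes')
# Example output ({module} is the calling module's file name):
# DEBUG 2023-01-14 11:24:09,123 demo: Found closing head tag after 51200 bytes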