diff --git a/bookmarks/services/website_loader.py b/bookmarks/services/website_loader.py
index 9835456..270bca2 100644
--- a/bookmarks/services/website_loader.py
+++ b/bookmarks/services/website_loader.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_bytes
+from django.utils import timezone
 
 logger = logging.getLogger(__name__)
 
@@ -26,12 +27,19 @@ def load_website_metadata(url: str):
     title = None
     description = None
     try:
+        start = timezone.now()
         page_text = load_page(url)
+        end = timezone.now()
+        logger.debug(f'Load duration: {end - start}')
+
+        start = timezone.now()
         soup = BeautifulSoup(page_text, 'html.parser')
 
         title = soup.title.string.strip() if soup.title is not None else None
         description_tag = soup.find('meta', attrs={'name': 'description'})
-        description = description = description_tag['content'].strip() if description_tag and description_tag['content'] else None
+        description = description_tag['content'].strip() if description_tag and description_tag['content'] else None
+        end = timezone.now()
+        logger.debug(f'Parsing duration: {end - start}')
     finally:
         return WebsiteMetadata(url=url, title=title, description=description)
 
@@ -44,22 +52,27 @@ def load_page(url: str):
     headers = fake_request_headers()
     size = 0
     content = None
+    iteration = 0
     # Use with to ensure request gets closed even if it's only read partially
     with requests.get(url, timeout=10, headers=headers, stream=True) as r:
         for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
             size += len(chunk)
+            iteration = iteration + 1
             if content is None:
                 content = chunk
             else:
                 content = content + chunk
+            logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
+
             # Stop reading if we have parsed end of head tag
             if '</head>'.encode('utf-8') in content:
                 logger.debug(f'Found closing head tag after {size} bytes')
                 break
             # Stop reading if we exceed limit
             if size > MAX_CONTENT_LIMIT:
                 logger.debug(f'Cancel reading document after {size} bytes')
                 break
+    logger.debug(f'Request consumed: {r._content_consumed}')
 
     # Use charset_normalizer to determine encoding that best matches the response content
     # Several sites seem to specify the response encoding incorrectly, so we ignore it and use custom logic instead
diff --git a/siteroot/settings/dev.py b/siteroot/settings/dev.py
index 79e8443..3e85d8f 100644
--- a/siteroot/settings/dev.py
+++ b/siteroot/settings/dev.py
@@ -25,7 +25,7 @@ LOGGING = {
     'disable_existing_loggers': False,
     'formatters': {
         'simple': {
-            'format': '{levelname} {message}',
+            'format': '{levelname} {asctime} {module}: {message}',
             'style': '{',
         },
     },
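
The repeated `start = timezone.now()` / `end = timezone.now()` pairs in the diff could be factored into a small helper. Below is a minimal sketch, assuming a hypothetical `log_duration` context manager that is not part of the change above; it uses `time.monotonic()`, which is immune to wall-clock adjustments and therefore better suited to measuring durations than `timezone.now()`:

```python
import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)


@contextmanager
def log_duration(label: str):
    # Hypothetical helper, not part of the diff above.
    # monotonic() only ever moves forward, so the difference
    # is a reliable duration even if the system clock changes.
    start = time.monotonic()
    try:
        yield
    finally:
        logger.debug(f'{label} duration: {time.monotonic() - start:.3f}s')
```

Each timed section would then shrink to a single `with` block:

```python
with log_duration('Load'):
    page_text = load_page(url)
```

Note also that `r._content_consumed` is a private attribute of `requests.Response`, so the `Request consumed` log line relies on an implementation detail that could change between `requests` versions.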