Improve website loader logging

Sascha Ißbrücker 2023-01-14 11:24:09 +01:00
parent 313a0ee99f
commit 4f9170c48d
2 changed files with 16 additions and 2 deletions

@@ -4,6 +4,7 @@ from dataclasses import dataclass
 import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_bytes
+from django.utils import timezone

 logger = logging.getLogger(__name__)

@@ -26,12 +27,20 @@ def load_website_metadata(url: str):
     title = None
     description = None

     try:
+        start = timezone.now()
         page_text = load_page(url)
+        end = timezone.now()
+        logger.debug(f'Load duration: {end - start}')
+
+        start = timezone.now()
         soup = BeautifulSoup(page_text, 'html.parser')

         title = soup.title.string.strip() if soup.title is not None else None
         description_tag = soup.find('meta', attrs={'name': 'description'})
-        description = description_tag['content'].strip() if description_tag and description_tag['content'] else None
+        description = description_tag['content'].strip() if description_tag and description_tag[
+            'content'] else None
+        end = timezone.now()
+        logger.debug(f'Parsing duration: {end - start}')
     finally:
         return WebsiteMetadata(url=url, title=title, description=description)
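
A note on the timing pattern above: timezone.now() returns a timezone-aware datetime, so end - start is a plain datetime.timedelta and renders as e.g. 0:00:00.532105 in the debug messages. A minimal standalone sketch of the pattern, where the settings.configure() call and the sleep stand-in are only there to make it runnable outside a Django project:

import time

from django.conf import settings

settings.configure(USE_TZ=True)  # minimal config so timezone.now() works standalone

from django.utils import timezone

start = timezone.now()
time.sleep(0.1)  # stand-in for the timed work, e.g. load_page(url)
end = timezone.now()

# end - start is a datetime.timedelta, e.g. 0:00:00.100123
print(f'Load duration: {end - start}')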
@@ -44,15 +53,19 @@ def load_page(url: str):
     headers = fake_request_headers()
     size = 0
     content = None
+    iteration = 0
     # Use with to ensure request gets closed even if it's only read partially
     with requests.get(url, timeout=10, headers=headers, stream=True) as r:
         for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
             size += len(chunk)
+            iteration = iteration + 1
             if content is None:
                 content = chunk
             else:
                 content = content + chunk

+            logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
+
             # Stop reading if we have parsed end of head tag
             if '</head>'.encode('utf-8') in content:
                 logger.debug(f'Found closing head tag after {size} bytes')
@@ -61,6 +74,7 @@ def load_page(url: str):
             if size > MAX_CONTENT_LIMIT:
                 logger.debug(f'Cancel reading document after {size} bytes')
                 break
+        logger.debug(f'Request consumed: {r._content_consumed}')

     # Use charset_normalizer to determine encoding that best matches the response content
     # Several sites seem to specify the response encoding incorrectly, so we ignore it and use custom logic instead
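
For reference, the streaming logic above assembles into roughly the following standalone sketch. The CHUNK_SIZE and MAX_CONTENT_LIMIT values are assumptions, the fake_request_headers() helper is omitted, and r._content_consumed is a private requests attribute, logged here purely for debugging:

import logging

import requests

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

CHUNK_SIZE = 50 * 1024           # assumed chunk size
MAX_CONTENT_LIMIT = 5000 * 1024  # assumed upper bound


def load_page(url: str):
    size = 0
    content = None
    iteration = 0
    # Use with to ensure the request gets closed even if it's only read partially
    with requests.get(url, timeout=10, stream=True) as r:
        for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
            size += len(chunk)
            iteration = iteration + 1
            if content is None:
                content = chunk
            else:
                content = content + chunk

            logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')

            # Stop reading once the closing head tag arrives; title and
            # meta description live in the head, so the rest is not needed
            if b'</head>' in content:
                logger.debug(f'Found closing head tag after {size} bytes')
                break
            # Bail out on documents that never close their head tag
            if size > MAX_CONTENT_LIMIT:
                logger.debug(f'Cancel reading document after {size} bytes')
                break
        # True once the response body was fully read, False if reading stopped early
        logger.debug(f'Request consumed: {r._content_consumed}')
    return content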

@@ -25,7 +25,7 @@ LOGGING = {
     'disable_existing_loggers': False,
     'formatters': {
         'simple': {
-            'format': '{levelname} {message}',
+            'format': '{levelname} {asctime} {module}: {message}',
             'style': '{',
         },
     },
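
The formatter change above adds a timestamp and the originating module to each log line, keeping str.format-style ({) placeholders. A minimal stdlib sketch of the equivalent configuration and the output it produces:

import logging

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('{levelname} {asctime} {module}: {message}', style='{'))

logger = logging.getLogger('demo')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

logger.debug('Found closing head tag after 51200 bytes')
# Example output ({module} is the calling module's file name):
# DEBUG 2023-01-14 11:24:09,123 demo: Found closing head tag after 51200 bytes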