Mirror of https://github.com/sissbruecker/linkding
Improve website loader logging

parent 313a0ee99f
commit 4f9170c48d

2 changed files with 16 additions and 2 deletions
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_bytes
+from django.utils import timezone

 logger = logging.getLogger(__name__)

@@ -26,12 +27,20 @@ def load_website_metadata(url: str):
     title = None
     description = None
     try:
+        start = timezone.now()
         page_text = load_page(url)
+        end = timezone.now()
+        logger.debug(f'Load duration: {end - start}')
+
+        start = timezone.now()
         soup = BeautifulSoup(page_text, 'html.parser')

         title = soup.title.string.strip() if soup.title is not None else None
         description_tag = soup.find('meta', attrs={'name': 'description'})
-        description = description_tag['content'].strip() if description_tag and description_tag['content'] else None
+        description = description_tag['content'].strip() if description_tag and description_tag[
+            'content'] else None
+        end = timezone.now()
+        logger.debug(f'Parsing duration: {end - start}')
     finally:
         return WebsiteMetadata(url=url, title=title, description=description)

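The hunk above wraps each expensive step (network load, HTML parse) in a pair of timezone.now() calls and logs the resulting timedelta. A minimal sketch of that pattern, assuming a configured Django project; do_work is a hypothetical stand-in for load_page() or the BeautifulSoup parse:

import logging
import time

from django.utils import timezone

logger = logging.getLogger(__name__)

def do_work():
    time.sleep(0.1)  # hypothetical stand-in for load_page() / parsing

def timed_step():
    start = timezone.now()
    do_work()
    end = timezone.now()
    # Subtracting two datetimes yields a timedelta, so the message renders
    # as e.g. "Step duration: 0:00:00.100123"
    logger.debug(f'Step duration: {end - start}')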
@@ -44,15 +53,19 @@ def load_page(url: str):
     headers = fake_request_headers()
     size = 0
     content = None
+    iteration = 0
     # Use with to ensure request gets closed even if it's only read partially
     with requests.get(url, timeout=10, headers=headers, stream=True) as r:
         for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
             size += len(chunk)
+            iteration = iteration + 1
             if content is None:
                 content = chunk
             else:
                 content = content + chunk

+            logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
+
             # Stop reading if we have parsed end of head tag
             if '</head>'.encode('utf-8') in content:
                 logger.debug(f'Found closing head tag after {size} bytes')
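The chunked read relies on requests' streaming mode: stream=True defers the body download, iter_content() yields it piece by piece, and leaving the with-block closes the connection even when the body was only partially read. A minimal sketch of the same pattern; CHUNK_SIZE is an assumed value, since the real constant is defined outside these hunks:

import requests

CHUNK_SIZE = 50 * 1024  # assumed value; not visible in this diff

def read_head(url: str) -> bytes:
    content = b''
    # stream=True keeps the body on the socket until iter_content() asks for it
    with requests.get(url, timeout=10, stream=True) as r:
        for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
            content += chunk
            if b'</head>' in content:
                break  # exiting the with-block closes the partially read response
    return content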
@@ -61,6 +74,7 @@ def load_page(url: str):
             if size > MAX_CONTENT_LIMIT:
                 logger.debug(f'Cancel reading document after {size} bytes')
                 break
+        logger.debug(f'Request consumed: {r._content_consumed}')

     # Use charset_normalizer to determine encoding that best matches the response content
     # Several sites seem to specify the response encoding incorrectly, so we ignore it and use custom logic instead
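The comment above refers to charset_normalizer, which ranks candidate encodings for a raw byte payload instead of trusting the response headers. A minimal sketch of its detection API; the sample bytes are illustrative, and the exact call in this file falls outside the shown hunks:

from charset_normalizer import from_bytes

raw = b'<html><head><title>Caf\xe9</title></head></html>'

# from_bytes() scores candidate encodings; best() returns the top match or None
best_match = from_bytes(raw).best()
if best_match is not None:
    text = str(best_match)  # the payload decoded with the detected encoding
    print(best_match.encoding, text)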
@@ -25,7 +25,7 @@ LOGGING = {
     'disable_existing_loggers': False,
     'formatters': {
         'simple': {
-            'format': '{levelname} {message}',
+            'format': '{levelname} {asctime} {module}: {message}',
             'style': '{',
         },
     },
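The formatter uses str.format-style placeholders ('style': '{') and now adds the timestamp and module name to each line. A standalone sketch of an equivalent dictConfig and the kind of output it produces; the handler and logger names here are assumptions, since linkding's full LOGGING dict is not shown in this hunk:

import logging
import logging.config

logging.config.dictConfig({
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'simple': {
            'format': '{levelname} {asctime} {module}: {message}',
            'style': '{',
        },
    },
    'handlers': {
        'console': {'class': 'logging.StreamHandler', 'formatter': 'simple'},
    },
    'root': {'handlers': ['console'], 'level': 'DEBUG'},
})

logging.getLogger('website_loader').debug('Load duration: 0:00:00.123456')
# Prints something like:
# DEBUG 2024-11-10 06:04:15,042 example: Load duration: 0:00:00.123456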