mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
add regex fallback back to title parser
This commit is contained in:
parent
79bef1384e
commit
f727ece7b3
1 changed files with 23 additions and 5 deletions
|
@ -25,6 +25,14 @@ from ..config import (
|
|||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
||||
HTML_TITLE_REGEX = re.compile(
|
||||
r'<title.*?>' # start matching text after <title> tag
|
||||
r'(.[^<>]+)', # get everything up to these symbols
|
||||
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
|
||||
)
|
||||
|
||||
|
||||
class TitleParser(HTMLParser):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
@ -84,12 +92,22 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
|
|||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
html = download_url(link.url, timeout=timeout)
|
||||
parser = TitleParser()
|
||||
parser.feed(html)
|
||||
output = parser.title
|
||||
if output:
|
||||
try:
|
||||
# try using relatively strict html parser first
|
||||
parser = TitleParser()
|
||||
parser.feed(html)
|
||||
output = parser.title
|
||||
except Exception:
|
||||
# fallback to regex that can handle broken/malformed html
|
||||
match = re.search(HTML_TITLE_REGEX, html)
|
||||
output = htmldecode(match.group(1).strip()) if match else None
|
||||
|
||||
# if title is better than the one in the db, update db with new title
|
||||
if isinstance(output, str) and output:
|
||||
if not link.title or len(output) >= len(link.title):
|
||||
Snapshot.objects.filter(url=link.url, timestamp=link.timestamp).update(title=output)
|
||||
Snapshot.objects.filter(url=link.url,
|
||||
timestamp=link.timestamp)\
|
||||
.update(title=output)
|
||||
else:
|
||||
raise ArchiveError('Unable to detect page title')
|
||||
except Exception as err:
|
||||
|
|
Loading…
Reference in a new issue