mirror of https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-23 04:33:11 +00:00
use urllib for url parsing instead of hand written string commands
This commit is contained in:
parent
8576a2f061
commit
2c9aad559d
1 changed file with 19 additions and 11 deletions
@@ -4,7 +4,8 @@ import sys
 import time
 import json
 import signal
-import urllib.request
+from urllib.request import urlopen
+from urllib.parse import urlparse
 
 from decimal import Decimal
 from urllib.parse import quote
@@ -33,12 +34,18 @@ from config import (
     SUBMIT_ARCHIVE_DOT_ORG,
 )
 
-# URL helpers
-without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
-without_query = lambda url: url.split('?', 1)[0]
-without_hash = lambda url: url.split('#', 1)[0]
-without_path = lambda url: url.split('/', 1)[0]
-domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
+# URL helpers: https://docs.python.org/3/library/urllib.parse.html#url-parsing
+scheme = lambda url: urlparse(url).scheme
+without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
+without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
+without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
+without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
+path = lambda url: urlparse(url).path
+basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
+domain = lambda url: urlparse(url).netloc
+query = lambda url: urlparse(url).query
+fragment = lambda url: urlparse(url).fragment
+extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 
 short_ts = lambda ts: ts.split('.')[0]
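Note: the new helpers lean on the named tuple that urlparse() returns: plain attribute reads for the simple accessors, and a _replace(...).geturl() round-trip for the without_* variants. A minimal sketch of what they return, using an illustrative URL (output checked against the lambdas above):

    from urllib.parse import urlparse

    url = 'https://example.com/path/page.html?q=1#top'  # illustrative only

    urlparse(url).scheme    # 'https'            -> scheme(url)
    urlparse(url).netloc    # 'example.com'      -> domain(url)
    urlparse(url).path      # '/path/page.html'  -> path(url)
    urlparse(url).query     # 'q=1'              -> query(url)
    urlparse(url).fragment  # 'top'              -> fragment(url)

    # without_scheme: blank the scheme, re-serialize, trim the leading '//'
    urlparse(url)._replace(scheme='').geturl().strip('//')
    # -> 'example.com/path/page.html?q=1#top'
    # Caveat: str.strip('//') removes any leading *and* trailing '/'
    # characters, so a trailing slash on the input URL is dropped too.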
@@ -214,7 +221,7 @@ def download_url(url):
     ))
     end = progress(TIMEOUT, prefix=' ')
     try:
-        downloaded_xml = urllib.request.urlopen(url).read().decode('utf-8')
+        downloaded_xml = urlopen(url).read().decode('utf-8')
         end()
     except Exception as e:
         end()
@@ -234,9 +241,10 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         if progress:
             sys.stdout.write('.')
             sys.stdout.flush()
-        html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
-        match = re.search('<title>(.*?)</title>', html_content)
-        return match.group(1) if match else default or None
+
+        html_content = urlopen(url, timeout=timeout).read().decode('utf-8')
+        match = re.search(HTML_TITLE_REGEX, html_content)
+        return match.group(1).strip() if match else None
     except Exception:
         return None
 
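Note: besides the urlopen rename, this hunk appears to fix two bugs in the old version: the hard-coded timeout=10 ignored the function's own timeout parameter, and `default` is not defined in the signature shown in the hunk header. For reference, a self-contained sketch of the new code path; HTML_TITLE_REGEX lives in config and is not shown in this diff, so the pattern below is an assumption modeled on the old inline regex:

    import re
    from urllib.request import urlopen

    # Assumed stand-in for config.HTML_TITLE_REGEX (not shown in this
    # commit); modeled on the old inline pattern '<title>(.*?)</title>'.
    HTML_TITLE_REGEX = '<title>(.*?)</title>'

    def fetch_page_title(url, timeout=10):
        """Attempt to guess a page's title by downloading its HTML (sketch)."""
        try:
            html_content = urlopen(url, timeout=timeout).read().decode('utf-8')
            match = re.search(HTML_TITLE_REGEX, html_content)
            return match.group(1).strip() if match else None
        except Exception:
            return None

    # e.g. fetch_page_title('https://example.com') should return 'Example Domain'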