use urllib for url parsing instead of hand written string commands

Author: Nick Sweeting
Date: 2019-02-19 01:45:19 -05:00
parent 8576a2f061
commit 2c9aad559d


@@ -4,7 +4,8 @@ import sys
 import time
 import json
 import signal
-import urllib.request
+from urllib.request import urlopen
+from urllib.parse import urlparse
 from decimal import Decimal
 from urllib.parse import quote
@@ -33,12 +34,18 @@ from config import (
     SUBMIT_ARCHIVE_DOT_ORG,
 )
 
-# URL helpers
-without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
-without_query = lambda url: url.split('?', 1)[0]
-without_hash = lambda url: url.split('#', 1)[0]
-without_path = lambda url: url.split('/', 1)[0]
-domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
+# URL helpers: https://docs.python.org/3/library/urllib.parse.html#url-parsing
+scheme = lambda url: urlparse(url).scheme
+without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
+without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
+without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
+without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
+path = lambda url: urlparse(url).path
+basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
+domain = lambda url: urlparse(url).netloc
+query = lambda url: urlparse(url).query
+fragment = lambda url: urlparse(url).fragment
+extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 short_ts = lambda ts: ts.split('.')[0]
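For context, here is a quick sanity check of what the new urlparse-based helpers return for a sample URL; the example URL and expected values are illustrative, not part of the commit. The trailing .strip('//') only matters for without_scheme, where it removes the leading // that geturl() leaves behind once the scheme is blanked out.

# Assumes the helper lambdas defined above are in scope.
url = 'https://example.com/some/path/index.html?key=val#frag'

scheme(url)            # -> 'https'
without_scheme(url)    # -> 'example.com/some/path/index.html?key=val#frag'
without_query(url)     # -> 'https://example.com/some/path/index.html#frag'
without_fragment(url)  # -> 'https://example.com/some/path/index.html?key=val'
without_path(url)      # -> 'https://example.com'
path(url)              # -> '/some/path/index.html'
basename(url)          # -> 'index.html'
domain(url)            # -> 'example.com'
query(url)             # -> 'key=val'
fragment(url)          # -> 'frag'
extension(url)         # -> 'html'
base_url(url)          # -> 'example.com/some/path/index.html?key=val#frag'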
@@ -214,7 +221,7 @@ def download_url(url):
     ))
     end = progress(TIMEOUT, prefix=' ')
     try:
-        downloaded_xml = urllib.request.urlopen(url).read().decode('utf-8')
+        downloaded_xml = urlopen(url).read().decode('utf-8')
         end()
     except Exception as e:
         end()
@@ -234,9 +241,10 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         if progress:
             sys.stdout.write('.')
             sys.stdout.flush()
-        html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
-        match = re.search('<title>(.*?)</title>', html_content)
-        return match.group(1) if match else default or None
+
+        html_content = urlopen(url, timeout=timeout).read().decode('utf-8')
+        match = re.search(HTML_TITLE_REGEX, html_content)
+        return match.group(1).strip() if match else None
     except Exception:
         return None
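For reference, a minimal self-contained sketch of the new fetch_page_title flow. HTML_TITLE_REGEX is defined elsewhere in the project (not shown in this diff), so the pattern below is only an assumed stand-in, and the progress-dot output is omitted.

import re
from urllib.request import urlopen

# Assumed stand-in for the project's HTML_TITLE_REGEX; the real pattern may differ.
HTML_TITLE_REGEX = re.compile(r'<title.*?>(.*?)</title>', re.IGNORECASE | re.DOTALL)

def fetch_page_title(url, timeout=10):
    """Return the contents of the page's <title> tag, or None on any failure."""
    try:
        html_content = urlopen(url, timeout=timeout).read().decode('utf-8')
        match = re.search(HTML_TITLE_REGEX, html_content)
        return match.group(1).strip() if match else None
    except Exception:
        return None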