new compiled URL regex with better markdown support

This commit is contained in:
Nick Sweeting 2019-02-27 04:49:25 -05:00
parent b2c22a73e6
commit ef4c446c8b
2 changed files with 40 additions and 3 deletions

View file

@ -27,12 +27,15 @@ from util import (
str_between,
get_link_type,
URL_REGEX,
check_url_parsing,
)
def parse_links(path):
"""parse a list of links dictionaries from a bookmark export file"""
check_url_parsing()
links = []
with open(path, 'r', encoding='utf-8') as file:
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
@ -192,7 +195,6 @@ def parse_shaarli_rss_export(rss_file):
yield info
def parse_netscape_html_export(html_file):
"""Parse netscape-format bookmarks export files (produced by all browsers)"""

View file

@ -58,8 +58,19 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
# keep only the part of ts that precedes its first '.' (all of ts when it has no '.')
short_ts = lambda ts: ts.partition('.')[0]
URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
HTML_TITLE_REGEX = '<title>(.[^<>]+)'
URL_REGEX = re.compile(
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols
r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
r'[^\]\[\(\)<>\""\'\s]+', # stop parsing at these symbols
re.IGNORECASE,
)
# Case-insensitive pattern for the text that follows an opening <title> tag.
# Group 1 is any single character followed by one or more characters that are
# neither '<' nor '>', so the capture stops at the next angle bracket.
HTML_TITLE_REGEX = re.compile(r'<title>(.[^<>]+)', flags=re.IGNORECASE)
def check_dependencies():
@ -124,6 +135,30 @@ def check_dependencies():
raise SystemExit(1)
def check_url_parsing():
"""Check that plain text regex URL parsing works as expected"""
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
assert len(re.findall(URL_REGEX, test_urls)) == 12
def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
"""chmod -R <permissions> <cwd>/<path>"""