Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2024-11-26 14:10:20 +00:00)
new compiled URL regex with better markdown support
This commit is contained in:
parent
b2c22a73e6
commit
ef4c446c8b
2 changed files with 40 additions and 3 deletions
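The gist of the change, as a minimal illustrative sketch (not part of the diff, sample text made up): the old URL_REGEX was a plain string whose trailing character class only stopped at "<" and a double quote, so a URL inside a markdown link swallowed the closing parenthesis and everything after it; the new compiled regex also stops at brackets, parentheses, quotes, and whitespace.

import re

# old pattern (pre-change string form, as it appears in the diff below)
OLD_URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\"]+'

# new pattern (post-change compiled form, copied from the diff below)
NEW_URL_REGEX = re.compile(
    r'http[s]?://'
    r'(?:[a-zA-Z]|[0-9]'
    r'|[$-_@.&+]|[!*\(\),]'
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'
    r'[^\]\[\(\)<>\"\'\s]+',
    re.IGNORECASE,
)

markdown = 'see [the docs](https://example.com/docs/page.html?v=2#intro) for details'

print(re.findall(OLD_URL_REGEX, markdown))
# ['https://example.com/docs/page.html?v=2#intro) for details']   <- trailing junk kept
print(NEW_URL_REGEX.findall(markdown))
# ['https://example.com/docs/page.html?v=2#intro']                <- stops cleanly at ")"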
@@ -27,12 +27,15 @@ from util import (
    str_between,
    get_link_type,
    URL_REGEX,
    check_url_parsing,
)


def parse_links(path):
    """parse a list of links dictionaries from a bookmark export file"""

    check_url_parsing()

    links = []
    with open(path, 'r', encoding='utf-8') as file:
        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
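Illustrative note, not part of the diff: because parse_links() now calls check_url_parsing() before reading anything, a broken URL_REGEX fails fast with an AssertionError instead of silently mis-parsing bookmark exports. A minimal sketch of running that guard on its own:

from util import check_url_parsing

# raises AssertionError if URL_REGEX no longer finds the 12 expected test URLs
check_url_parsing()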
@@ -192,7 +195,6 @@ def parse_shaarli_rss_export(rss_file):

        yield info


def parse_netscape_html_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

@@ -58,8 +58,19 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

short_ts = lambda ts: ts.split('.')[0]

URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\"]+'
HTML_TITLE_REGEX = '<title>(.[^<>]+)'
URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
    r'[^\]\[\(\)<>\"\'\s]+',          # stop parsing at these symbols
    re.IGNORECASE,
)
HTML_TITLE_REGEX = re.compile(
    r'<title>'     # start matching text after <title> tag
    r'(.[^<>]+)',  # get everything up to these symbols
    re.IGNORECASE,
)


def check_dependencies():
@@ -124,6 +135,30 @@ def check_dependencies():
        raise SystemExit(1)


def check_url_parsing():
    """Check that plain text regex URL parsing works as expected"""
    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com

    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
    assert len(re.findall(URL_REGEX, test_urls)) == 12


def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
    """chmod -R <permissions> <cwd>/<path>"""
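A note on the assertion above (my reading of the test block, not stated in the diff): exactly 12 of the 15 sample lines should match, because example1 through example12 all carry an http:// or https:// scheme (including the mixed-case HTtpS:// one, thanks to re.IGNORECASE), while example13 and example14 have no scheme at all and example15 uses the misspelled "htt://" scheme. If the regex ever starts matching one of the three negative cases, or stops matching one of the bracketed or markdown-wrapped URLs, the assert fails and parsing aborts before any bookmark export is read.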