mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2025-02-16 21:38:33 +00:00
support finding multiple urls as substrings in text
This commit is contained in:
parent
f3a3d76439
commit
3e26ae4a66
1 changed file with 3 additions and 1 deletion
|
@@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
|
|||
|
||||
|
||||
# Pattern for finding every http(s) URL embedded in a blob of text.
#
# The whole expression is wrapped in a zero-width lookahead containing a single
# capture group.  Because the lookahead consumes no characters, the regex
# engine retries at every position, so URLs that appear *inside* other URLs
# (e.g. in a query string) are reported as well.  Consequences for callers:
#   - URL_REGEX.findall(text) returns the list of captured URL strings.
#   - For match objects (search/finditer), the URL is in group(1);
#     group(0) is always the empty string.
URL_REGEX = re.compile(
    r'(?=('                            # lookahead + capture: allows overlapping matches
    r'http[s]?://'                     # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'               # followed by allowed alphanum characters
    r'|[$\-_@.&+]|[!*\(\),]'           # or allowed symbols (hyphen escaped so it is a
                                       #   literal "-", not the range "$" through "_")
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'   # or a percent-encoded byte (%XX)
    r'[^\]\[\(\)<>"\'\s]+'             # then consume until one of these stop symbols
    r'))',
    re.IGNORECASE,
)
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue