From 128419f99181371a89aae9b9999aa1d548b612cf Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 24 Apr 2024 17:50:18 -0700 Subject: [PATCH] expand comment about markdown url trailing paren trimming --- archivebox/util.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/archivebox/util.py b/archivebox/util.py index e19510f8..e7de03b4 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -90,6 +90,11 @@ def fix_url_from_markdown(url_str: str) -> str: helpful to fix URLs parsed from markdown e.g. input: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def + + IMPORTANT ASSUMPTION: valid urls won't have unbalanced or incorrectly nested parentheses + e.g. this will fail if the user actually wants to ingest a url like 'https://example.com/some_wei)(rd_url' + in that case it will return https://example.com/some_wei (truncated up to the first unbalanced paren) + This assumption is true 99.9999% of the time, and for the rare edge case the user can use the url_list parser. """ trimmed_url = url_str