From 8526906779382a6ece6d2a4a257b04393a480fd4 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@nicksweeting.com>
Date: Tue, 17 Apr 2018 09:44:07 -0400
Subject: [PATCH] fix urlencoding of wget path

---
 util.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/util.py b/util.py
index 63bdcc51..f7c88ae4 100644
--- a/util.py
+++ b/util.py
@@ -403,8 +403,10 @@ def wget_output_path(link, look_in=None):
     See docs on wget --adjust-extension (-E)
     """
 
+    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
+
     if link['type'] in ('PDF', 'image'):
-        return quote(link['base_url'])
+        return urlencode(link['base_url'])
 
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
@@ -418,7 +420,7 @@ def wget_output_path(link, look_in=None):
             if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
         ]
         if html_files:
-            return quote(os.path.join(wget_folder, html_files[0]))
+            return urlencode(os.path.join(wget_folder, html_files[0]))
 
     # If finding the actual output file didn't work, fall back to the buggy
     # implementation of the wget .html appending algorithm
@@ -427,20 +429,20 @@ def wget_output_path(link, look_in=None):
 
     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
         # already ends in .html
-        return quote(link['base_url'])
+        return urlencode(link['base_url'])
     else:
         # .html needs to be appended
         without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
         if without_scheme.endswith('/'):
             if query:
-                return quote('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-            return quote('#'.join([without_scheme + 'index.html', *split_url[1:]]))
+                return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
+            return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
         else:
             if query:
-                return quote('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
+                return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
             elif '/' in without_scheme:
-                return quote('#'.join([without_scheme + '.html', *split_url[1:]]))
-            return quote(link['base_url'] + '/index.html')
+                return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
+            return urlencode(link['base_url'] + '/index.html')
 
 
 def derived_link_info(link):