minor url fixes and refactoring

This commit is contained in:
Nick Sweeting 2017-07-04 06:24:03 -05:00
parent 0df2bfe4c8
commit 881de8adbe
6 changed files with 46 additions and 22 deletions

View file

@ -22,6 +22,7 @@
<DT><A HREF="https://duckduckgo.com/?q=archive+firefox+bookmarks&t=ffab&ia=web" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://duckduckgo.com/favicon.ico" ICON="">archive firefox bookmarks at DuckDuckGo</A>
<DT><A HREF="https://github.com/nodiscc" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">nodiscc (nodiscc) · GitHub</A>
<DT><A HREF="https://github.com/pirate/bookmark-archiver#troubleshooting" ADD_DATE="1497562975" LAST_MODIFIED="1497562975" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">pirate/bookmark-archiver · Github</A>
<DT><A HREF="http://www.cs.unc.edu/~fabian/papers/foniks-oak11.pdf" ADD_DATE="1497562976" LAST_MODIFIED="1497562976" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">Phonotactic Reconstruction of Encrypted VoIP Conversations</A>
<DT><A HREF="https://www.ghacks.net/2009/07/23/firefox-bookmarks-archiver/" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://www.ghacks.net/wp-content/uploads/2005/10/favicon.ico" ICON="">Firefox Bookmarks Archiver - gHacks Tech News</A>
</DL><p>
<DT><H3 ADD_DATE="1409779227" LAST_MODIFIED="1470506008" PERSONAL_TOOLBAR_FOLDER="true">Bookmarks Toolbar</H3>

View file

@ -32,7 +32,7 @@ def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=60):
"""download full site using wget"""
domain = link['base_url'].split('/', 1)[0]
if not os.path.exists('{}/{}'.format(out_dir, domain)) or overwrite:
if not os.path.exists(os.path.join(out_dir, domain)) or overwrite:
print(' - Downloading Full Site')
CMD = [
*'wget --timestamping --adjust-extension --no-parent'.split(' '), # Docs: https://www.gnu.org/software/wget/manual/wget.html
@ -54,7 +54,9 @@ def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=60):
def fetch_pdf(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromium-browser'):
"""print PDF of site to file using chrome --headless"""
if (not os.path.exists('{}/output.pdf'.format(out_dir)) or overwrite) and link['type'] not in ('PDF', 'image'):
path = os.path.join(out_dir, 'output.pdf')
if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'):
print(' - Printing PDF')
CMD = [
chrome_binary,
@ -76,7 +78,9 @@ def fetch_pdf(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromiu
def fetch_screenshot(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromium-browser', resolution='1440,900'):
"""take screenshot of site using chrome --headless"""
if (not os.path.exists('{}/screenshot.png'.format(out_dir)) or overwrite) and link['type'] not in ('PDF', 'image'):
path = os.path.join(out_dir, 'screenshot.png')
if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'):
print(' - Snapping Screenshot')
CMD = [
chrome_binary,
@ -98,7 +102,10 @@ def fetch_screenshot(out_dir, link, overwrite=False, timeout=60, chrome_binary='
def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
"""submit site to archive.org for archiving via their service, save returned archive url"""
if (not os.path.exists('{}/archive.org.txt'.format(out_dir)) or overwrite):
path = os.path.join(out_dir, 'archive.org.txt')
if not os.path.exists(path) or overwrite:
print(' - Submitting to archive.org')
submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])
@ -129,7 +136,9 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
def fetch_favicon(out_dir, link, overwrite=False, timeout=60):
"""download site favicon from google's favicon api"""
if not os.path.exists('{}/favicon.ico'.format(out_dir)) or overwrite:
path = os.path.join(out_dir, 'favicon.ico')
if not os.path.exists(path) or overwrite:
print(' - Fetching Favicon')
CMD = 'curl https://www.google.com/s2/favicons?domain={domain}'.format(**link).split(' ')
fout = open('{}/favicon.ico'.format(out_dir), 'w')
@ -149,7 +158,9 @@ def fetch_audio(out_dir, link, overwrite=False, timeout=60):
if link['type'] not in ('soundcloud',):
return
if (not os.path.exists('{}/audio'.format(out_dir)) or overwrite):
path = os.path.join(out_dir, 'audio')
if not os.path.exists(path) or overwrite:
print(' - Downloading audio')
CMD = [
"youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
@ -173,8 +184,9 @@ def fetch_video(out_dir, link, overwrite=False, timeout=60):
if link['type'] not in ('youtube', 'youku', 'vimeo'):
return
path = os.path.join(out_dir, 'video')
if (not os.path.exists('{}/video'.format(out_dir)) or overwrite):
if not os.path.exists(path) or overwrite:
print(' - Downloading video')
CMD = [
"youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",

View file

@ -19,7 +19,12 @@ def dump_index(links, service):
link_html.format(**derived_link_info(link)) for link in links
)
template_vars = (datetime.now().strftime('%Y-%m-%d %H:%M'), article_rows)
template_vars = {
'num_links': len(links),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'rows': article_rows,
}
with open(os.path.join(service, 'index.html'), 'w', encoding='utf-8') as f:
f.write(index_html.format(*template_vars))
f.write(index_html.format(**template_vars))

View file

@ -168,8 +168,8 @@ def valid_links(links):
return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp'))
def calculate_archive_url(link):
"""calculate the path to the wgetted html file, since wget may
def html_appended_url(link):
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension."""
@ -190,7 +190,13 @@ def calculate_archive_url(link):
def derived_link_info(link):
"""extend link info with the archive urls and other derived data"""
link_info = {**link}
link_info = {
**link,
'files_url': 'archive/{timestamp}/'.format(**link),
'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
}
# PDF and images are handled slightly differently
# wget, screenshot, & pdf urls all point to the same file
@ -203,7 +209,7 @@ def derived_link_info(link):
})
else:
link_info.update({
'archive_url': calculate_archive_url(link),
'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link)
})

View file

@ -64,11 +64,11 @@
</head>
<body>
<header>
<h1 title="Last modified {}">
<h1 title="Last modified {time_updated}">
<img src="https://nicksweeting.com/images/archive.png" height="36px">
Archived Sites <img src="https://getpocket.com/favicon.ico" height="36px"> <br/>
<small>
Exported with: <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a>
Archived with: <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a> on {date_updated}
</small>
</h1>
</header>
@ -76,7 +76,7 @@
<thead>
<tr>
<th style="width: 120px;"><img src="https://getpocket.com/favicon.ico" height="12px"> Starred</th>
<th style="width: 45vw;">Saved Article</th>
<th style="width: 45vw;">Saved Articles ({num_links})</th>
<th style="width: 50px">Files</th>
<th style="width: 50px">PDF</th>
<th style="width: 60px;font-size:0.8em;">Screenshot</th>
@ -84,7 +84,7 @@
<th style="width: 100px;whitespace:nowrap;overflow-x:scroll;display:block">Original URL</th>
</tr>
</thead>
<tbody>{}</tbody>
<tbody>{rows}</tbody>
</table>
</body>
</html>

View file

@ -1,12 +1,12 @@
<tr>
<td>{time}</td>
<td><a href="archive/{timestamp}/{archive_url}" style="font-size:1.4em;text-decoration:none;color:black;" title="{title}">
<img src="archive/{timestamp}/favicon.ico">
<td><a href="{archive_url}" style="font-size:1.4em;text-decoration:none;color:black;" title="{title}">
<img src="{favicon_url}">
{title} <small style="background-color: #eee;border-radius:4px; float:right">{tags}</small>
</td>
<td style="text-align:center"><a href="archive/{timestamp}/" title="Files">📂</a></td>
<td style="text-align:center"><a href="{files_url}" title="Files">📂</a></td>
<td style="text-align:center"><a href="{pdf_link}" title="PDF">📄</a></td>
<td style="text-align:center"><a href="{screenshot_link}" title="Screenshot">🖼</a></td>
<td style="text-align:center"><a href="https://web.archive.org/web/{base_url}" title="Archive.org">🏛</a></td>
<td>🔗 <img src="https://www.google.com/s2/favicons?domain={domain}" height="16px"> <a href="{url}">{url}</a></td>
<td style="text-align:center"><a href="{archive_org_url}" title="Archive.org">🏛</a></td>
<td>🔗 <img src="{google_favicon_url}" height="16px"> <a href="{url}">{url}</a></td>
</tr>