de-duplicate timestamps using url

Nick Sweeting 2017-06-15 17:33:01 -05:00
parent 8ac6096b39
commit 5de6b3adc1


@@ -216,6 +216,52 @@ def fetch_favicon(out_dir, link, overwrite=False):
### ORCHESTRATION
def next_uniq_timestamp(used_timestamps, timestamp):
    """resolve duplicate timestamps by appending a decimal"""
    if timestamp not in used_timestamps:
        return timestamp

    if '.' in timestamp:
        timestamp, nonce = timestamp.split('.')
        nonce = int(nonce)
    else:
        nonce = 1

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp
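For illustration only (not part of the commit), a minimal sketch of how the decimal nonce resolves collisions; used_timestamps can be any container that supports membership tests:

used = {'1497566000', '1497566000.1'}
next_uniq_timestamp(used, '1497566000')   # -> '1497566000.2' (.1 is taken)
next_uniq_timestamp(used, '1497567000')   # -> '1497567000' (already unique)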
def uniquefied_links(links):
    """uniqueify link timestamps by de-duping using url, returns links sorted most recent -> oldest

    needed because Firefox will produce exports where many links share the same timestamp; this func
    ensures that all non-duplicate links have monotonically increasing timestamps
    """

    links = list(reversed(sorted(links, key=lambda l: (l['timestamp'], l['url']))))
    seen_timestamps = {}

    for link in links:
        t = link['timestamp']
        if t in seen_timestamps:
            if link['url'] == seen_timestamps[t]['url']:
                # don't create new unique timestamp if link is the same
                continue
            else:
                # resolve duplicate timestamp by appending a decimal
                link['timestamp'] = next_uniq_timestamp(seen_timestamps, link['timestamp'])
        seen_timestamps[link['timestamp']] = link

    return links
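For illustration only (not part of the commit), two bookmarks sharing a Firefox-export timestamp: the second distinct url gets a decimal suffix, while an exact url duplicate would keep its timestamp unchanged:

links = [
    {'url': 'https://example.com/a', 'timestamp': '1497566000'},
    {'url': 'https://example.com/b', 'timestamp': '1497566000'},
]
uniquefied_links(links)
# -> [{'url': 'https://example.com/b', 'timestamp': '1497566000'},
#     {'url': 'https://example.com/a', 'timestamp': '1497566000.1'}]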
def valid_links(links):
    return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp'))
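For illustration only (not part of the commit), valid_links is a lazy filter that keeps only http(s)/ftp URLs and drops unarchivable schemes:

list(valid_links([
    {'url': 'https://example.com'},
    {'url': 'javascript:alert(1)'},
]))
# -> [{'url': 'https://example.com'}]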
def dump_index(links, service):
    with open(INDEX_TEMPLATE, 'r') as f:
        index_html = f.read()
@@ -271,6 +317,10 @@ def dump_website(link, service, overwrite=False):
    if link['type']:
        print('    i Type: {}'.format(link['type']))

    if not link['url'].startswith('http'):
        print('    X Skipping: invalid link.')
        return

    if FETCH_WGET:
        fetch_wget(out_dir, link, overwrite=overwrite)