mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-26 06:00:22 +00:00
54 lines
1.5 KiB
Python
Executable file
54 lines
1.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
import argparse
|
|
import re
|
|
from typing import List
|
|
|
|
from archive import parse_json_link_index
|
|
from config import OUTPUT_DIR
|
|
from index import write_json_links_index
|
|
|
|
|
|
def cleanup_index(patterns: List[str], yes=False):
    """Remove every link whose URL matches one of *patterns* from the JSON index.

    Loads the index for ``OUTPUT_DIR`` with ``parse_json_link_index``, prints
    each link that would be dropped together with the pattern that matched
    it, and, once confirmed, writes the remaining links back with
    ``write_json_links_index``.

    Args:
        patterns: Python regex source strings. Each is matched with
            ``re.search`` against a link's ``'url'`` value; the first
            pattern that matches wins. ``None`` is treated as "no patterns".
        yes: When true, skip the interactive confirmation prompt.

    Raises:
        re.error: If a pattern is not a valid regular expression.
        SystemExit: With the message ``'aborting'`` when the user does not
            answer ``y`` / ``yes`` at the prompt.
    """
    # argparse's action='append' yields None when the option is never given,
    # so tolerate a missing pattern list instead of failing on iteration.
    regexes = [re.compile(p) for p in patterns or ()]

    index = parse_json_link_index(OUTPUT_DIR)
    links = index['links']

    filtered = []   # (link, regex that matched it) pairs, in index order
    remaining = []  # links that matched no pattern, in index order
    for link in links:
        url = link['url']
        # First matching pattern, or None when nothing matches.
        matched = next((r for r in regexes if r.search(url)), None)
        if matched is None:
            remaining.append(link)
        else:
            filtered.append((link, matched))

    print("Filtered out {}/{} urls:".format(len(filtered), len(links)))
    for link, regex in filtered:
        print(" {url} via {regex}".format(url=link['url'], regex=regex.pattern))

    if not yes:
        res = input("Remove {} entries from index? [y/n] ".format(len(filtered)))
        if res.strip().lower() not in ('y', 'yes'):
            # Raise SystemExit directly: the bare exit() helper is injected by
            # the `site` module and is missing under `python -S`.
            raise SystemExit('aborting')

    write_json_links_index(OUTPUT_DIR, remaining)
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: collect the regexes and the confirmation flag,
    # then hand them to cleanup_index.
    #
    # The title is passed as `description`; ArgumentParser's first positional
    # parameter is `prog`, which would put the title in the usage line.
    p = argparse.ArgumentParser(description='Index purging tool')
    # At least one pattern is required: with action='append' an omitted
    # option is None, which gives the purge nothing to match against.
    p.add_argument(
        '--regex', '-r',
        action='append',
        required=True,
        help='Python regex to filter out',
    )
    p.add_argument(
        '--yes',
        action='store_true',
        default=False,
        help='Do not prompt for confirmation',
    )

    args = p.parse_args()
    cleanup_index(args.regex, yes=args.yes)
|