diff --git a/archivebox/__init__.py b/archivebox/__init__.py
index 4cd3afd5..485a340d 100644
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -1,3 +1,6 @@
__package__ = 'archivebox'
from . import core
+from . import cli
+
+from .main import *
diff --git a/archivebox/__main__.py b/archivebox/__main__.py
index 570a8c21..3386d46d 100755
--- a/archivebox/__main__.py
+++ b/archivebox/__main__.py
@@ -2,9 +2,14 @@
__package__ = 'archivebox'
-from .cli.archivebox import main
+import sys
+from .cli import archivebox
+
+
+def main():
+ archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
if __name__ == '__main__':
- main()
+ archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index 38c577c7..f4cd99b9 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -2,13 +2,17 @@ __package__ = 'archivebox.cli'
import os
-from typing import Dict
+from typing import Dict, List, Optional, IO
from importlib import import_module
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
# these common commands will appear sorted before any others for ease-of-use
-display_first = ('help', 'version', 'init', 'info', 'config', 'list', 'update', 'add', 'remove')
+meta_cmds = ('help', 'version')
+main_cmds = ('init', 'info', 'config')
+archive_cmds = ('add', 'remove', 'update', 'list')
+
+display_first = (*meta_cmds, *main_cmds, *archive_cmds)
# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')
@@ -42,11 +46,14 @@ def list_subcommands() -> Dict[str, str]:
return dict(sorted(COMMANDS, key=display_order))
-def run_subcommand(subcommand: str, args=None) -> None:
+def run_subcommand(subcommand: str,
+ subcommand_args: List[str]=None,
+ stdin: Optional[IO]=None,
+ pwd: Optional[str]=None) -> None:
"""run a given ArchiveBox subcommand with the given list of args"""
module = import_module('.archivebox_{}'.format(subcommand), __package__)
- module.main(args) # type: ignore
+ module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
SUBCOMMANDS = list_subcommands()
diff --git a/archivebox/cli/archivebox.py b/archivebox/cli/archivebox.py
index d1326721..d6fe207c 100755
--- a/archivebox/cli/archivebox.py
+++ b/archivebox/cli/archivebox.py
@@ -5,19 +5,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox'
__description__ = 'ArchiveBox: The self-hosted internet archive.'
-import os
import sys
import argparse
+from typing import Optional, List, IO
+
from . import list_subcommands, run_subcommand
-from ..legacy.config import OUTPUT_DIR
+from ..config import OUTPUT_DIR
-def parse_args(args=None):
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
subcommands = list_subcommands()
-
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@@ -43,54 +41,24 @@ def parse_args(args=None):
default=None,
)
parser.add_argument(
- "args",
+ "subcommand_args",
help="Arguments for the subcommand",
nargs=argparse.REMAINDER,
)
-
- command = parser.parse_args(args)
+ command = parser.parse_args(args or ())
- if command.help:
+ if command.help or command.subcommand is None:
command.subcommand = 'help'
if command.version:
command.subcommand = 'version'
- # print('--------------------------------------------')
- # print('Command: ', sys.argv[0])
- # print('Subcommand: ', command.subcommand)
- # print('Args to pass:', args[1:])
- # print('--------------------------------------------')
+ run_subcommand(
+ subcommand=command.subcommand,
+ subcommand_args=command.subcommand_args,
+ stdin=stdin,
+ pwd=pwd or OUTPUT_DIR,
+ )
- return command.subcommand, command.args
-
-
-def print_import_tutorial():
- print('Welcome to ArchiveBox!')
- print()
- print('To import an existing archive (from a previous version of ArchiveBox):')
- print(' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
- print(' 2. archivebox init')
- print()
- print('To start a new archive:')
- print(' 1. Create an emptry directory, then cd into it and run:')
- print(' 2. archivebox init')
- print()
- print('For more information, see the migration docs here:')
- print(' https://github.com/pirate/ArchiveBox/wiki/Migration')
-
-def main(args=None):
- subcommand, subcommand_args = parse_args(args)
- existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
-
- if subcommand is None:
- if existing_index:
- run_subcommand('help', subcommand_args)
- else:
- print_import_tutorial()
- raise SystemExit(0)
-
- run_subcommand(subcommand, subcommand_args)
-
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 714e916c..d0209916 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -7,90 +7,75 @@ __description__ = 'Add a new URL or list of URLs to your archive'
import sys
import argparse
-from typing import List, Optional
+from typing import List, Optional, IO
-from ..legacy.config import stderr, check_dependencies, check_data_folder
-from ..legacy.util import (
- handle_stdin_import,
- handle_file_import,
-)
-from ..legacy.main import update_archive_data
+from ..main import add
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR, ONLY_NEW
-def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
+ formatter_class=SmartFormatter,
)
- # parser.add_argument(
- # '--depth', #'-d',
- # type=int,
- # help='Recursively archive all linked pages up to this many hops away',
- # default=0,
- # )
parser.add_argument(
- '--only-new', #'-n',
+ '--update-all', #'-n',
action='store_true',
- help="Don't attempt to retry previously skipped/failed links when updating",
+ default=not ONLY_NEW,
+ help="Also retry previously skipped/failed links when adding new links",
)
parser.add_argument(
'--index-only', #'-o',
action='store_true',
help="Add the links to the main index without archiving them",
)
- # parser.add_argument(
- # '--mirror', #'-m',
- # action='store_true',
- # help='Archive an entire site (finding all linked pages below it on the same domain)',
- # )
- # parser.add_argument(
- # '--crawler', #'-r',
- # choices=('depth_first', 'breadth_first'),
- # help='Controls which crawler to use in order to find outlinks in a given page',
- # default=None,
- # )
parser.add_argument(
- 'url',
+ 'import_path',
nargs='?',
type=str,
default=None,
- help='URL of page to archive (or path to local file)'
+ help=(
+ 'URL or path to local file containing a list of links to import. e.g.:\n'
+ ' https://getpocket.com/users/USERNAME/feed/all\n'
+ ' https://example.com/some/rss/feed.xml\n'
+ ' ~/Downloads/firefox_bookmarks_export.html\n'
+ ' ~/Desktop/sites_list.csv\n'
+ )
)
- command = parser.parse_args(args)
-
- check_dependencies()
-
- ### Handle ingesting urls piped in through stdin
- # (.e.g if user does cat example_urls.txt | archivebox add)
- import_path = None
- if stdin or not sys.stdin.isatty():
- stdin_raw_text = stdin or sys.stdin.read()
- if stdin_raw_text and command.url:
- stderr(
- '[X] You should pass either a path as an argument, '
- 'or pass a list of links via stdin, but not both.\n'
- )
- raise SystemExit(1)
-
- import_path = handle_stdin_import(stdin_raw_text)
-
- ### Handle ingesting url from a remote file/feed
- # (e.g. if an RSS feed URL is used as the import path)
- elif command.url:
- import_path = handle_file_import(command.url)
-
- update_archive_data(
- import_path=import_path,
- resume=None,
- only_new=command.only_new,
+ command = parser.parse_args(args or ())
+ import_str = accept_stdin(stdin)
+ add(
+ import_str=import_str,
+ import_path=command.import_path,
+ update_all=command.update_all,
index_only=command.index_only,
+ out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
+
+
+# TODO: Implement these
+#
+# parser.add_argument(
+# '--depth', #'-d',
+# type=int,
+# help='Recursively archive all linked pages up to this many hops away',
+# default=0,
+# )
+# parser.add_argument(
+# '--mirror', #'-m',
+# action='store_true',
+# help='Archive an entire site (finding all linked pages below it on the same domain)',
+# )
+# parser.add_argument(
+# '--crawler', #'-r',
+# choices=('depth_first', 'breadth_first'),
+# help='Controls which crawler to use in order to find outlinks in a given page',
+# default=None,
+# )
diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py
index 97a8447d..2d373535 100644
--- a/archivebox/cli/archivebox_config.py
+++ b/archivebox/cli/archivebox_config.py
@@ -7,28 +7,14 @@ __description__ = 'Get and set your ArchiveBox project configuration values'
import sys
import argparse
-from typing import Optional, List
+from typing import Optional, List, IO
-from ..legacy.util import SmartFormatter
-from ..legacy.config import (
- check_data_folder,
- OUTPUT_DIR,
- load_all_config,
- write_config_file,
- CONFIG,
- CONFIG_FILE,
- USER_CONFIG,
- ConfigDict,
- stderr,
- get_real_name,
-)
+from ..main import config
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
-def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@@ -57,102 +43,18 @@ def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
type=str,
help='KEY or KEY=VALUE formatted config values to get or set',
)
- command = parser.parse_args(args)
+ command = parser.parse_args(args or ())
+ config_options_str = accept_stdin(stdin)
- if stdin or not sys.stdin.isatty():
- stdin_raw_text = stdin or sys.stdin.read()
- if stdin_raw_text and command.config_options:
- stderr(
- '[X] You should either pass config values as an arguments '
- 'or via stdin, but not both.\n',
- color='red',
- )
- raise SystemExit(1)
-
- config_options = stdin_raw_text.split('\n')
- else:
- config_options = command.config_options
-
- no_args = not (command.get or command.set or command.reset or command.config_options)
-
- matching_config: ConfigDict = {}
- if command.get or no_args:
- if config_options:
- config_options = [get_real_name(key) for key in config_options]
- matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
- failed_config = [key for key in config_options if key not in CONFIG]
- if failed_config:
- stderr()
- stderr('[X] These options failed to get', color='red')
- stderr(' {}'.format('\n '.join(config_options)))
- raise SystemExit(1)
- else:
- matching_config = CONFIG
-
- print(printable_config(matching_config))
- raise SystemExit(not matching_config)
- elif command.set:
- new_config = {}
- failed_options = []
- for line in config_options:
- if line.startswith('#') or not line.strip():
- continue
- if '=' not in line:
- stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
- stderr(f' {line}')
- raise SystemExit(2)
-
- raw_key, val = line.split('=')
- raw_key = raw_key.upper().strip()
- key = get_real_name(raw_key)
- if key != raw_key:
- stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
-
- if key in CONFIG:
- new_config[key] = val.strip()
- else:
- failed_options.append(line)
-
- if new_config:
- before = CONFIG
- matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
- after = load_all_config()
- print(printable_config(matching_config))
-
- side_effect_changes: ConfigDict = {}
- for key, val in after.items():
- if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
- side_effect_changes[key] = after[key]
-
- if side_effect_changes:
- stderr()
- stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
- print(' {}'.format(printable_config(side_effect_changes, prefix=' ')))
- if failed_options:
- stderr()
- stderr('[X] These options failed to set:', color='red')
- stderr(' {}'.format('\n '.join(failed_options)))
- raise SystemExit(bool(failed_options))
- elif command.reset:
- stderr('[X] This command is not implemented yet.', color='red')
- stderr(' Please manually remove the relevant lines from your config file:')
- stderr(f' {CONFIG_FILE}')
- raise SystemExit(2)
-
- else:
- stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
- stderr(' archivebox config')
- stderr(' archivebox config --get SOME_KEY')
- stderr(' archivebox config --set SOME_KEY=SOME_VALUE')
- raise SystemExit(2)
-
-
-def printable_config(config: ConfigDict, prefix: str='') -> str:
- return f'\n{prefix}'.join(
- f'{key}={val}'
- for key, val in config.items()
- if not (isinstance(val, dict) or callable(val))
+ config(
+ config_options_str=config_options_str,
+ config_options=command.config_options,
+ get=command.get,
+ set=command.set,
+ reset=command.reset,
+ out_dir=pwd or OUTPUT_DIR,
)
+
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py
index b049ef70..b1cf1c5b 100755
--- a/archivebox/cli/archivebox_help.py
+++ b/archivebox/cli/archivebox_help.py
@@ -7,52 +7,24 @@ __description__ = 'Print the ArchiveBox help message and usage'
import sys
import argparse
-from ..legacy.util import reject_stdin
-from ..legacy.config import ANSI
-from . import list_subcommands
+from typing import Optional, List, IO
+
+from ..main import help
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
-def main(args=None):
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
- parser.parse_args(args)
- reject_stdin(__command__)
+ parser.parse_args(args or ())
+ reject_stdin(__command__, stdin)
-
- COMMANDS_HELP_TEXT = '\n '.join(
- f'{cmd.ljust(20)} {summary}'
- for cmd, summary in list_subcommands().items()
- )
-
- print('''{green}ArchiveBox: The self-hosted internet archive.{reset}
-
-{lightblue}Usage:{reset}
- archivebox [command] [--help] [--version] [...args]
-
-{lightblue}Comamnds:{reset}
- {}
-
-{lightblue}Example Use:{reset}
- mkdir my-archive; cd my-archive/
- archivebox init
- archivebox info
-
- archivebox add https://example.com/some/page
- archivebox add --depth=1 ~/Downloads/bookmarks_export.html
-
- archivebox list --sort=timestamp --csv=timestamp,url,is_archived
- archivebox schedule --every=week https://example.com/some/feed.rss
- archivebox update --resume=15109948213.123
-
-{lightblue}Documentation:{reset}
- https://github.com/pirate/ArchiveBox/wiki
-'''.format(COMMANDS_HELP_TEXT, **ANSI))
+ help(out_dir=pwd or OUTPUT_DIR)
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_info.py b/archivebox/cli/archivebox_info.py
index bf04d89e..d3cc99aa 100644
--- a/archivebox/cli/archivebox_info.py
+++ b/archivebox/cli/archivebox_info.py
@@ -7,25 +7,24 @@ __description__ = 'Print out some info and statistics about the archive collecti
import sys
import argparse
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import info
+from typing import Optional, List, IO
+
+from ..main import info
+from ..config import OUTPUT_DIR
+from ..util import reject_stdin
-def main(args=None):
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
- parser.parse_args(args)
- reject_stdin(__command__)
+ parser.parse_args(args or ())
+ reject_stdin(__command__, stdin)
+
+ info(out_dir=pwd or OUTPUT_DIR)
- info()
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py
index 632b9a1e..a66f011c 100755
--- a/archivebox/cli/archivebox_init.py
+++ b/archivebox/cli/archivebox_init.py
@@ -7,23 +7,24 @@ __description__ = 'Initialize a new ArchiveBox collection in the current directo
import sys
import argparse
-from ..legacy.util import reject_stdin
-from ..legacy.main import init
+from typing import Optional, List, IO
+
+from ..main import init
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
-def main(args=None):
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
- parser.parse_args(args)
- reject_stdin(__command__)
+ parser.parse_args(args or ())
+ reject_stdin(__command__, stdin)
- init()
+ init(out_dir=pwd or OUTPUT_DIR)
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index 416fa3d7..126ad144 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -2,15 +2,17 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
-__description__ = 'List all the URLs currently in the archive.'
+__description__ = 'List, filter, and export information about archive entries'
import sys
import argparse
-from ..legacy.util import SmartFormatter, reject_stdin, to_json, to_csv
-from ..legacy.config import check_data_folder, OUTPUT_DIR
-from ..legacy.main import (
- list_archive_data,
+from typing import Optional, List, IO
+
+from ..main import list_all
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
+from ..index import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@@ -23,11 +25,7 @@ from ..legacy.main import (
get_unrecognized_folders,
)
-def main(args=None):
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@@ -93,57 +91,27 @@ def main(args=None):
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
- 'patterns',
+ 'filter_patterns',
nargs='*',
type=str,
default=None,
help='List only URLs matching these filter patterns.'
)
- command = parser.parse_args(args)
- reject_stdin(__command__)
+ command = parser.parse_args(args or ())
+ filter_patterns_str = accept_stdin(stdin)
- links = list_archive_data(
- filter_patterns=command.patterns,
+ list_all(
+ filter_patterns_str=filter_patterns_str,
+ filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
- before=command.before,
+ status=command.status,
after=command.after,
+ before=command.before,
+ sort=command.sort,
+ csv=command.csv,
+ json=command.json,
+ out_dir=pwd or OUTPUT_DIR,
)
- if command.sort:
- links = sorted(links, key=lambda link: getattr(link, command.sort))
-
- links = list(links)
-
- if command.status == 'indexed':
- folders = get_indexed_folders(links, out_dir=OUTPUT_DIR)
- elif command.status == 'archived':
- folders = get_archived_folders(links, out_dir=OUTPUT_DIR)
- elif command.status == 'unarchived':
- folders = get_unarchived_folders(links, out_dir=OUTPUT_DIR)
-
- elif command.status == 'present':
- folders = get_present_folders(links, out_dir=OUTPUT_DIR)
- elif command.status == 'valid':
- folders = get_valid_folders(links, out_dir=OUTPUT_DIR)
- elif command.status == 'invalid':
- folders = get_invalid_folders(links, out_dir=OUTPUT_DIR)
-
- elif command.status == 'duplicate':
- folders = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
- elif command.status == 'orphaned':
- folders = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
- elif command.status == 'corrupted':
- folders = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
- elif command.status == 'unrecognized':
- folders = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
-
- if command.csv:
- print(to_csv(folders.values(), csv_cols=command.csv.split(','), header=True))
- elif command.json:
- print(to_json(folders.values(), indent=4, sort_keys=True))
- else:
- print('\n'.join(f'{folder} {link}' for folder, link in folders.items()))
- raise SystemExit(not folders)
-
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_manage.py b/archivebox/cli/archivebox_manage.py
index 9d1c8eb3..f2b91cc2 100644
--- a/archivebox/cli/archivebox_manage.py
+++ b/archivebox/cli/archivebox_manage.py
@@ -6,24 +6,18 @@ __description__ = 'Run an ArchiveBox Django management command'
import sys
-from ..legacy.config import OUTPUT_DIR, setup_django, check_data_folder
+from typing import Optional, List, IO
+
+from ..main import manage
+from ..config import OUTPUT_DIR
-def main(args=None):
- check_data_folder()
-
- setup_django(OUTPUT_DIR)
- from django.core.management import execute_from_command_line
-
- args = sys.argv if args is None else ['archivebox', *args]
-
- args[0] = f'{sys.argv[0]} manage'
-
- if args[1:] == []:
- args.append('help')
-
- execute_from_command_line(args)
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
+ manage(
+ args=args,
+ out_dir=pwd or OUTPUT_DIR,
+ )
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py
index 4ddba354..c5f5ff53 100644
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -7,17 +7,14 @@ __description__ = 'Remove the specified URLs from the archive.'
import sys
import argparse
+from typing import Optional, List, IO
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import remove_archive_links
+from ..main import remove
+from ..util import accept_stdin
+from ..config import OUTPUT_DIR
-def main(args=None):
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@@ -56,33 +53,25 @@ def main(args=None):
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
- 'pattern',
+ 'filter_patterns',
nargs='*',
type=str,
- default=None,
help='URLs matching this filter pattern will be removed from the index.'
)
- command = parser.parse_args(args)
+ command = parser.parse_args(args or ())
+ filter_str = accept_stdin(stdin)
- if not sys.stdin.isatty():
- stdin_raw_text = sys.stdin.read()
- if stdin_raw_text and command.url:
- print(
- '[X] You should pass either a pattern as an argument, '
- 'or pass a list of patterns via stdin, but not both.\n'
- )
- raise SystemExit(1)
-
- patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
- else:
- patterns = command.pattern
-
- remove_archive_links(
- filter_patterns=patterns, filter_type=command.filter_type,
- before=command.before, after=command.after,
- yes=command.yes, delete=command.delete,
+ remove(
+ filter_str=filter_str,
+ filter_patterns=command.filter_patterns,
+ filter_type=command.filter_type,
+ before=command.before,
+ after=command.after,
+ yes=command.yes,
+ delete=command.delete,
+ out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py
index f6e685f8..b6a15e13 100644
--- a/archivebox/cli/archivebox_schedule.py
+++ b/archivebox/cli/archivebox_schedule.py
@@ -4,34 +4,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
__description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'
-import os
import sys
import argparse
-from datetime import datetime
-from crontab import CronTab, CronSlices
+from typing import Optional, List, IO
+
+from ..main import schedule
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
-from ..legacy.util import reject_stdin
-from ..legacy.config import (
- OUTPUT_DIR,
- LOGS_DIR,
- ARCHIVEBOX_BINARY,
- USER,
- ANSI,
- stderr,
- check_data_folder,
-)
-
-
-CRON_COMMENT = 'archivebox_schedule'
-
-
-def main(args=None):
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@@ -57,7 +40,7 @@ def main(args=None):
group.add_argument(
'--clear', # '-c'
action='store_true',
- help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
+ help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
)
group.add_argument(
'--show', # '-s'
@@ -67,13 +50,14 @@ def main(args=None):
group.add_argument(
'--foreground', '-f',
action='store_true',
- help=("Launch ArchiveBox as a long-running foreground task "
+ help=("Launch ArchiveBox scheduler as a long-running foreground task "
"instead of using cron."),
)
group.add_argument(
'--run-all', # '-a',
action='store_true',
- help='Run all the scheduled jobs once immediately, independent of their configured schedules',
+ help=("Run all the scheduled jobs once immediately, independent of "
+ "their configured schedules, can be used together with --foreground"),
)
parser.add_argument(
'import_path',
@@ -83,115 +67,21 @@ def main(args=None):
help=("Check this path and import any new links on every run "
"(can be either local file or remote URL)"),
)
- command = parser.parse_args(args)
- reject_stdin(__command__)
+ command = parser.parse_args(args or ())
+ reject_stdin(__command__, stdin)
- os.makedirs(LOGS_DIR, exist_ok=True)
-
- cron = CronTab(user=True)
- cron = dedupe_jobs(cron)
-
- existing_jobs = list(cron.find_comment(CRON_COMMENT))
- if command.foreground or command.run_all:
- if command.import_path or (not existing_jobs):
- stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
- stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
- raise SystemExit(1)
- print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
- if command.run_all:
- try:
- for job in existing_jobs:
- sys.stdout.write(f' > {job.command}')
- sys.stdout.flush()
- job.run()
-                    sys.stdout.write(f'\r    √ {job.command}\n')
- except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
- raise SystemExit(1)
- if command.foreground:
- try:
- for result in cron.run_scheduler():
- print(result)
- except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
- raise SystemExit(1)
-
- elif command.show:
- if existing_jobs:
- print('\n'.join(str(cmd) for cmd in existing_jobs))
- else:
- stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
- stderr(' To schedule a new job, run:')
- stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
- raise SystemExit(0)
-
- elif command.clear:
- print(cron.remove_all(comment=CRON_COMMENT))
- cron.write()
- raise SystemExit(0)
-
- elif command.every:
- quoted = lambda s: f'"{s}"' if s and ' ' in s else s
- cmd = [
- 'cd',
- quoted(OUTPUT_DIR),
- '&&',
- quoted(ARCHIVEBOX_BINARY),
- *(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
- '2>&1',
- '>',
- quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
-
- ]
- new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
-
- if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
- set_every = getattr(new_job.every(), command.every)
- set_every()
- elif CronSlices.is_valid(command.every):
- new_job.setall(command.every)
- else:
- stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
- stderr(' It must be one of minute/hour/day/week/month')
- stderr(' or a quoted cron-format schedule like:')
- stderr(' archivebox init --every=day https://example.com/some/rss/feed.xml')
- stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
- raise SystemExit(1)
-
- cron = dedupe_jobs(cron)
- cron.write()
-
- total_runs = sum(j.frequency_per_year() for j in cron)
- existing_jobs = list(cron.find_comment(CRON_COMMENT))
-
- print()
-    print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
- print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
- if total_runs > 60 and not command.quiet:
- stderr()
- stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
-        stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
- stderr()
- stderr(' Make sure you have enough storage space available to hold all the data.')
- stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
- raise SystemExit(0)
-
-
-def dedupe_jobs(cron: CronTab) -> CronTab:
- deduped = set()
- for job in list(cron):
- unique_tuple = (str(job.slices), job.command)
- if unique_tuple not in deduped:
- deduped.add(unique_tuple)
- cron.remove(job)
-
- for schedule, command in deduped:
- job = cron.new(command=command, comment=CRON_COMMENT)
- job.setall(schedule)
- job.enable()
-
- return cron
+ schedule(
+ add=command.add,
+ show=command.show,
+ clear=command.clear,
+ foreground=command.foreground,
+ run_all=command.run_all,
+ quiet=command.quiet,
+ every=command.every,
+ import_path=command.import_path,
+ out_dir=pwd or OUTPUT_DIR,
+ )
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index 3fdaff5c..a5cf4b2c 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -7,15 +7,14 @@ __description__ = 'Run the ArchiveBox HTTP server'
import sys
import argparse
-from ..legacy.config import setup_django, IS_TTY, OUTPUT_DIR, ANSI, check_data_folder
-from ..legacy.util import reject_stdin
+from typing import Optional, List, IO
+
+from ..main import server
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
-def main(args=None):
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@@ -33,26 +32,15 @@ def main(args=None):
action='store_true',
help='Enable auto-reloading when code or templates change',
)
- command = parser.parse_args(args)
- reject_stdin(__command__)
+ command = parser.parse_args(args or ())
+ reject_stdin(__command__, stdin)
- setup_django(OUTPUT_DIR)
- from django.core.management import call_command
- from django.contrib.auth.models import User
-
- if IS_TTY and not User.objects.filter(is_superuser=True).exists():
- print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
- print()
- print(' To create an admin user, run:')
- print(' archivebox manage createsuperuser')
- print()
-
- print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
- if not command.reload:
- command.runserver_args.append('--noreload')
-
- call_command("runserver", *command.runserver_args)
+ server(
+ runserver_args=command.runserver_args,
+ reload=command.reload,
+ out_dir=pwd or OUTPUT_DIR,
+ )
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py
index dd509e3f..04939328 100644
--- a/archivebox/cli/archivebox_shell.py
+++ b/archivebox/cli/archivebox_shell.py
@@ -7,27 +7,26 @@ __description__ = 'Enter an interactive ArchiveBox Django shell'
import sys
import argparse
-from ..legacy.config import setup_django, OUTPUT_DIR, check_data_folder
-from ..legacy.util import reject_stdin
+from typing import Optional, List, IO
+
+from ..main import shell
+from ..config import OUTPUT_DIR
+from ..util import reject_stdin
-def main(args=None):
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
- parser.parse_args(args)
- reject_stdin(__command__)
+ parser.parse_args(args or ())
+ reject_stdin(__command__, stdin)
+
+ shell(
+ out_dir=pwd or OUTPUT_DIR,
+ )
- setup_django(OUTPUT_DIR)
- from django.core.management import call_command
- call_command("shell_plus")
-
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index e40b5b21..936e45ec 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -2,27 +2,36 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox update'
-__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
+__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links'
import sys
import argparse
-from typing import List
+from typing import List, Optional, IO
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import update_archive_data
+from ..main import update
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
+from ..index import (
+ get_indexed_folders,
+ get_archived_folders,
+ get_unarchived_folders,
+ get_present_folders,
+ get_valid_folders,
+ get_invalid_folders,
+ get_duplicate_folders,
+ get_orphaned_folders,
+ get_corrupted_folders,
+ get_unrecognized_folders,
+)
-def main(args: List[str]=None):
- check_data_folder()
-
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
+ formatter_class=SmartFormatter,
)
parser.add_argument(
'--only-new', #'-n',
@@ -40,16 +49,75 @@ def main(args: List[str]=None):
help='Resume the update process from a given timestamp',
default=None,
)
+ parser.add_argument(
+ '--overwrite', #'-x',
+ action='store_true',
+ help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
+ )
+ parser.add_argument(
+ '--before', #'-b',
+ type=float,
+ help="Update only links bookmarked before the given timestamp.",
+ default=None,
+ )
+ parser.add_argument(
+ '--after', #'-a',
+ type=float,
+ help="Update only links bookmarked after the given timestamp.",
+ default=None,
+ )
+ parser.add_argument(
+ '--status',
+ type=str,
+ choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
+ default='indexed',
+ help=(
+ 'Update only links or data directories that have the given status\n'
+ f' indexed {get_indexed_folders.__doc__} (the default)\n'
+ f' archived {get_archived_folders.__doc__}\n'
+ f' unarchived {get_unarchived_folders.__doc__}\n'
+ '\n'
+ f' present {get_present_folders.__doc__}\n'
+ f' valid {get_valid_folders.__doc__}\n'
+ f' invalid {get_invalid_folders.__doc__}\n'
+ '\n'
+ f' duplicate {get_duplicate_folders.__doc__}\n'
+ f' orphaned {get_orphaned_folders.__doc__}\n'
+ f' corrupted {get_corrupted_folders.__doc__}\n'
+ f' unrecognized {get_unrecognized_folders.__doc__}\n'
+ )
+ )
+ parser.add_argument(
+ '--filter-type',
+ type=str,
+ choices=('exact', 'substring', 'domain', 'regex'),
+ default='exact',
+ help='Type of pattern matching to use when filtering URLs',
+ )
+ parser.add_argument(
+ 'filter_patterns',
+ nargs='*',
+ type=str,
+ default=None,
+        help='Update only URLs matching these filter patterns.'
+ )
command = parser.parse_args(args)
- reject_stdin(__command__)
+ filter_patterns_str = accept_stdin(stdin)
- update_archive_data(
- import_path=None,
+ update(
resume=command.resume,
only_new=command.only_new,
index_only=command.index_only,
+ overwrite=command.overwrite,
+ filter_patterns_str=filter_patterns_str,
+ filter_patterns=command.filter_patterns,
+ filter_type=command.filter_type,
+ status=command.status,
+ after=command.after,
+ before=command.before,
+ out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py
index 53fb4953..d3707161 100755
--- a/archivebox/cli/archivebox_version.py
+++ b/archivebox/cli/archivebox_version.py
@@ -4,26 +4,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox version'
__description__ = 'Print the ArchiveBox version and dependency information'
-import os
-import re
import sys
import argparse
-from ..legacy.util import reject_stdin, human_readable_size
-from ..legacy.config import (
- ANSI,
- VERSION,
- CODE_LOCATIONS,
- CONFIG_LOCATIONS,
- DATA_LOCATIONS,
- DEPENDENCIES,
- check_dependencies,
-)
+from typing import Optional, List, IO
+
+from ..main import version
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
-def main(args=None):
- args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@@ -34,92 +25,14 @@ def main(args=None):
action='store_true',
help='Only print ArchiveBox version number and nothing else.',
)
- command = parser.parse_args(args)
- reject_stdin(__command__)
+ command = parser.parse_args(args or ())
+ reject_stdin(__command__, stdin)
- if command.quiet:
- print(VERSION)
- else:
- print('ArchiveBox v{}'.format(VERSION))
- print()
-
- print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
- for name, dependency in DEPENDENCIES.items():
- print_dependency_version(name, dependency)
-
- print()
- print('{white}[i] Code locations:{reset}'.format(**ANSI))
- for name, folder in CODE_LOCATIONS.items():
- print_folder_status(name, folder)
-
- print()
- print('{white}[i] Config locations:{reset}'.format(**ANSI))
- for name, folder in CONFIG_LOCATIONS.items():
- print_folder_status(name, folder)
-
- print()
- print('{white}[i] Data locations:{reset}'.format(**ANSI))
- for name, folder in DATA_LOCATIONS.items():
- print_folder_status(name, folder)
-
- print()
- check_dependencies()
-
-
-def print_folder_status(name, folder):
- if folder['enabled']:
- if folder['is_valid']:
-            color, symbol, note = 'green', '√', 'valid'
- else:
- color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
- else:
- color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
-
- if folder['path']:
- if os.path.exists(folder['path']):
- num_files = (
- f'{len(os.listdir(folder["path"]))} files'
- if os.path.isdir(folder['path']) else
- human_readable_size(os.path.getsize(folder['path']))
- )
- else:
- num_files = 'missing'
-
- print(
- ANSI[color],
- symbol,
- ANSI['reset'],
- name.ljust(24),
- (folder["path"] or '').ljust(70),
- num_files.ljust(14),
- ANSI[color],
- note,
- ANSI['reset'],
- )
-
-
-def print_dependency_version(name, dependency):
- if dependency['enabled']:
- if dependency['is_valid']:
-            color, symbol, note = 'green', '√', 'valid'
- version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
- else:
- color, symbol, note, version = 'red', 'X', 'invalid', '?'
- else:
- color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
-
- print(
- ANSI[color],
- symbol,
- ANSI['reset'],
- name.ljust(24),
- (dependency["path"] or '').ljust(70),
- version.ljust(14),
- ANSI[color],
- note,
- ANSI['reset'],
+ version(
+ quiet=command.quiet,
+ out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
- main()
+ main(args=sys.argv[1:], stdin=sys.stdin)
diff --git a/archivebox/legacy/logs.py b/archivebox/cli/logging.py
similarity index 94%
rename from archivebox/legacy/logs.py
rename to archivebox/cli/logging.py
index e0b34301..87a7fab1 100644
--- a/archivebox/legacy/logs.py
+++ b/archivebox/cli/logging.py
@@ -1,3 +1,5 @@
+__package__ = 'archivebox.cli'
+
import os
import sys
@@ -5,8 +7,8 @@ from datetime import datetime
from dataclasses import dataclass
from typing import Optional, List
-from .schema import Link, ArchiveResult
-from .config import ANSI, OUTPUT_DIR, IS_TTY
+from ..index.schema import Link, ArchiveResult
+from ..config import ANSI, OUTPUT_DIR, IS_TTY
@dataclass
@@ -80,7 +82,7 @@ def log_indexing_finished(out_path: str):
### Archiving Stage
-def log_archiving_started(num_links: int, resume: Optional[float]):
+def log_archiving_started(num_links: int, resume: Optional[float]=None):
start_ts = datetime.now()
_LAST_RUN_STATS.archiving_start_ts = start_ts
print()
@@ -92,7 +94,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]):
**ANSI,
))
else:
-        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
**ANSI,
@@ -213,18 +215,18 @@ def log_archive_method_finished(result: ArchiveResult):
print()
-def log_list_started(filter_patterns: List[str], filter_type: str):
+def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
filter_type,
**ANSI,
))
- print(' {}'.format(' '.join(filter_patterns)))
+ print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links):
- from .util import to_csv
+ from ..util import links_to_csv
print()
print('---------------------------------------------------------------------------------------------------')
- print(to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
+ print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print()
diff --git a/archivebox/tests.py b/archivebox/cli/tests.py
similarity index 97%
rename from archivebox/tests.py
rename to archivebox/cli/tests.py
index 921fa1e7..14d0e4c6 100755
--- a/archivebox/tests.py
+++ b/archivebox/cli/tests.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-__package__ = 'archivebox'
+__package__ = 'archivebox.cli'
import os
@@ -29,15 +29,15 @@ TEST_CONFIG = {
OUTPUT_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
-from .legacy.main import init
-from .legacy.index import load_main_index
-from .legacy.config import (
+from ..main import init
+from ..index import load_main_index
+from ..config import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
)
-from .cli import (
+from . import (
archivebox_init,
archivebox_add,
archivebox_remove,
diff --git a/archivebox/legacy/config.py b/archivebox/config/__init__.py
similarity index 96%
rename from archivebox/legacy/config.py
rename to archivebox/config/__init__.py
index a38451d1..eb62d3d8 100644
--- a/archivebox/legacy/config.py
+++ b/archivebox/config/__init__.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox.legacy'
+__package__ = 'archivebox.config'
import os
import io
@@ -13,7 +13,7 @@ from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL
from configparser import ConfigParser
-from .config_stubs import (
+from .stubs import (
SimpleConfigValueDict,
ConfigValue,
ConfigDict,
@@ -40,7 +40,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'GENERAL_CONFIG': {
'OUTPUT_DIR': {'type': str, 'default': None},
'CONFIG_FILE': {'type': str, 'default': None},
- 'ONLY_NEW': {'type': bool, 'default': False},
+ 'ONLY_NEW': {'type': bool, 'default': True},
'TIMEOUT': {'type': int, 'default': 60},
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
@@ -122,8 +122,7 @@ ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
-LEGACY_DIR_NAME = 'legacy'
-TEMPLATES_DIR_NAME = 'templates'
+TEMPLATES_DIR_NAME = 'themes'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
@@ -158,8 +157,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'REPO_DIR': {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
'PYTHON_DIR': {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
- 'LEGACY_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], LEGACY_DIR_NAME)},
- 'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['LEGACY_DIR'], TEMPLATES_DIR_NAME)},
+ 'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},
'OUTPUT_DIR': {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
@@ -210,7 +208,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
- 'CONFIG_LOCATIONS': {'default': lambda c: get_config_locations(c)},
+ 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
}
@@ -370,6 +368,7 @@ def load_config(defaults: ConfigDefaultDict,
stderr(' For config documentation and examples see:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
stderr()
+ raise
raise SystemExit(2)
return extended_config
@@ -492,18 +491,13 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
'REPO_DIR': {
'path': os.path.abspath(config['REPO_DIR']),
'enabled': True,
- 'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], '.github')),
+ 'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], 'archivebox')),
},
'PYTHON_DIR': {
'path': os.path.abspath(config['PYTHON_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')),
},
- 'LEGACY_DIR': {
- 'path': os.path.abspath(config['LEGACY_DIR']),
- 'enabled': True,
- 'is_valid': os.path.exists(os.path.join(config['LEGACY_DIR'], 'util.py')),
- },
'TEMPLATES_DIR': {
'path': os.path.abspath(config['TEMPLATES_DIR']),
'enabled': True,
@@ -511,14 +505,9 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
},
}
-def get_config_locations(config: ConfigDict) -> ConfigValue:
+def get_external_locations(config: ConfigDict) -> ConfigValue:
abspath = lambda path: None if path is None else os.path.abspath(path)
return {
- 'CONFIG_FILE': {
- 'path': abspath(config['CHROME_USER_DATA_DIR']),
- 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
- 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
- },
'CHROME_USER_DATA_DIR': {
'path': abspath(config['CHROME_USER_DATA_DIR']),
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
@@ -553,11 +542,26 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': os.path.exists(config['ARCHIVE_DIR']),
},
+ 'CONFIG_FILE': {
+ 'path': os.path.abspath(config['CONFIG_FILE']),
+ 'enabled': True,
+ 'is_valid': os.path.exists(config['CONFIG_FILE']),
+ },
'SQL_INDEX': {
+ 'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+ 'enabled': True,
+ 'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+ },
+ 'JSON_INDEX': {
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
},
+ 'HTML_INDEX': {
+ 'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+ 'enabled': True,
+ 'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+ },
}
def get_dependency_info(config: ConfigDict) -> ConfigValue:
@@ -731,7 +735,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
json_index_exists = os.path.exists(os.path.join(output_dir, JSON_INDEX_FILENAME))
if not json_index_exists:
- stderr('[X] No archive index was found in current directory.', color='red')
+ stderr('[X] No archive main index was found in current directory.', color='red')
stderr(f' {output_dir}')
stderr()
stderr(' Are you running archivebox in the right folder?')
@@ -743,7 +747,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
raise SystemExit(2)
sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME))
- from .storage.sql import list_migrations
+ from ..index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
diff --git a/archivebox/legacy/config_stubs.py b/archivebox/config/stubs.py
similarity index 99%
rename from archivebox/legacy/config_stubs.py
rename to archivebox/config/stubs.py
index b741bc3a..f7d5059a 100644
--- a/archivebox/legacy/config_stubs.py
+++ b/archivebox/config/stubs.py
@@ -17,6 +17,7 @@ class ConfigDict(BaseConfig, total=False):
SHOW_PROGRESS: bool
OUTPUT_DIR: str
+ CONFIG_FILE: str
ONLY_NEW: bool
TIMEOUT: int
MEDIA_TIMEOUT: int
@@ -63,7 +64,6 @@ class ConfigDict(BaseConfig, total=False):
ANSI: Dict[str, str]
REPO_DIR: str
PYTHON_DIR: str
- LEGACY_DIR: str
TEMPLATES_DIR: str
ARCHIVE_DIR: str
SOURCES_DIR: str
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index d131d3e8..23fe3286 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -1,9 +1,7 @@
-
-from datetime import datetime
-
from django.contrib import admin
-from .models import Page
+from core.models import Page
+
class PageAdmin(admin.ModelAdmin):
list_display = ('timestamp', 'short_url', 'title', 'is_archived', 'num_outputs', 'added', 'updated', 'url_hash')
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 9c82c61d..a41f3d1c 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -4,8 +4,8 @@ import uuid
from django.db import models
-from legacy.schema import Link
-from legacy.util import parse_date
+from ..util import parse_date
+from ..index.schema import Link
class Page(models.Model):
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 90d54080..7411ab9c 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -2,8 +2,8 @@ from django.shortcuts import render
from django.views import View
-from legacy.config import OUTPUT_DIR
-from legacy.index import load_main_index, load_main_index_meta
+from .index import load_main_index, load_main_index_meta
+from .config import OUTPUT_DIR
class MainIndex(View):
@@ -34,7 +34,7 @@ class AddLinks(View):
def post(self, request):
import_path = request.POST['url']
- # TODO: add the links to the index here using archivebox.legacy.main.update_archive_data
+ # TODO: add the links to the index here using archivebox.main.add
print(f'Adding URL: {import_path}')
return render(template_name=self.template, request=request, context={})
diff --git a/archivebox/core/welcome_message.py b/archivebox/core/welcome_message.py
index b3a9ebf8..70410c75 100644
--- a/archivebox/core/welcome_message.py
+++ b/archivebox/core/welcome_message.py
@@ -1,4 +1,17 @@
-print()
-print('[i] Welcome to the ArchiveBox Shell! Example usage:')
-print(' Page.objects.all()')
-print(' User.objects.all()')
+from cli import list_subcommands
+
+from .config import ANSI
+
+
+if __name__ == '__main__':
+ print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
+ # print('from archivebox.core.models import Page, User')
+ print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI))
+ print()
+ print('[i] Welcome to the ArchiveBox Shell! Example use:')
+ print(' print(Page.objects.filter(is_archived=True).count())')
+ print(' Page.objects.get(url="https://example.com").as_json()')
+
+ print(' from archivebox.main import get_invalid_folders')
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
new file mode 100644
index 00000000..a8f28ce1
--- /dev/null
+++ b/archivebox/extractors/__init__.py
@@ -0,0 +1,105 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+from datetime import datetime
+
+from ..index.schema import Link
+from ..index import (
+ load_link_details,
+ write_link_details,
+ patch_main_index,
+)
+from ..util import enforce_types
+from ..cli.logging import (
+ log_link_archiving_started,
+ log_link_archiving_finished,
+ log_archive_method_started,
+ log_archive_method_finished,
+)
+
+from .title import should_save_title, save_title
+from .favicon import should_save_favicon, save_favicon
+from .wget import should_save_wget, save_wget
+from .pdf import should_save_pdf, save_pdf
+from .screenshot import should_save_screenshot, save_screenshot
+from .dom import should_save_dom, save_dom
+from .git import should_save_git, save_git
+from .media import should_save_media, save_media
+from .archive_org import should_save_archive_dot_org, save_archive_dot_org
+
+
+@enforce_types
+def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None) -> Link:
+ """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
+
+ ARCHIVE_METHODS = (
+ ('title', should_save_title, save_title),
+ ('favicon', should_save_favicon, save_favicon),
+ ('wget', should_save_wget, save_wget),
+ ('pdf', should_save_pdf, save_pdf),
+ ('screenshot', should_save_screenshot, save_screenshot),
+ ('dom', should_save_dom, save_dom),
+ ('git', should_save_git, save_git),
+ ('media', should_save_media, save_media),
+ ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
+ )
+
+ out_dir = out_dir or link.link_dir
+ try:
+ is_new = not os.path.exists(out_dir)
+ if is_new:
+ os.makedirs(out_dir)
+
+ link = load_link_details(link, out_dir=out_dir)
+ log_link_archiving_started(link, out_dir, is_new)
+ link = link.overwrite(updated=datetime.now())
+ stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
+
+ for method_name, should_run, method_function in ARCHIVE_METHODS:
+ try:
+ if method_name not in link.history:
+ link.history[method_name] = []
+
+ if should_run(link, out_dir) or overwrite:
+ log_archive_method_started(method_name)
+
+ result = method_function(link=link, out_dir=out_dir)
+
+ link.history[method_name].append(result)
+
+ stats[result.status] += 1
+ log_archive_method_finished(result)
+ else:
+ stats['skipped'] += 1
+ except Exception as e:
+ raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
+ method_name,
+ link.url,
+ )) from e
+
+ # print(' ', stats)
+
+ write_link_details(link, out_dir=link.link_dir)
+ patch_main_index(link)
+
+ # # If any changes were made, update the main links index json and html
+ # was_changed = stats['succeeded'] or stats['failed']
+ # if was_changed:
+ # patch_main_index(link)
+
+ log_link_archiving_finished(link, link.link_dir, is_new, stats)
+
+ except KeyboardInterrupt:
+ try:
+ write_link_details(link, out_dir=link.link_dir)
+ except:
+ pass
+ raise
+
+ except Exception as err:
+ print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
+ raise
+
+ return link
diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
new file mode 100644
index 00000000..ad6d409b
--- /dev/null
+++ b/archivebox/extractors/archive_org.py
@@ -0,0 +1,115 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional, List, Dict, Tuple
+from collections import defaultdict
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+ enforce_types,
+ TimedProgress,
+ run,
+ PIPE,
+ DEVNULL,
+ is_static_file,
+ ArchiveError,
+ chmod_file,
+)
+from ..config import (
+ VERSION,
+ TIMEOUT,
+ SAVE_ARCHIVE_DOT_ORG,
+ CURL_BINARY,
+ CURL_VERSION,
+ CHECK_SSL_VALIDITY
+)
+
+
+
+@enforce_types
+def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
+ out_dir = out_dir or link.link_dir
+ if is_static_file(link.url):
+ return False
+
+ if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
+ # if open(path, 'r').read().strip() != 'None':
+ return False
+
+ return SAVE_ARCHIVE_DOT_ORG
+
+@enforce_types
+def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+ """submit site to archive.org for archiving via their service, save returned archive url"""
+
+ out_dir = out_dir or link.link_dir
+ output: ArchiveOutput = 'archive.org.txt'
+ archive_org_url = None
+ submit_url = 'https://web.archive.org/save/{}'.format(link.url)
+ cmd = [
+ CURL_BINARY,
+ '--location',
+ '--head',
+ '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
+ '--max-time', str(timeout),
+ *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+ submit_url,
+ ]
+ status = 'succeeded'
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
+ content_location, errors = parse_archive_dot_org_response(result.stdout)
+ if content_location:
+ archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
+ elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
+ archive_org_url = None
+ # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
+ elif errors:
+ raise ArchiveError(', '.join(errors))
+ else:
+ raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
+ except Exception as err:
+ status = 'failed'
+ output = err
+ finally:
+ timer.end()
+
+ if output and not isinstance(output, Exception):
+ # instead of writing None when archive.org rejects the url write the
+ # url to resubmit it to archive.org. This is so when the user visits
+ # the URL in person, it will attempt to re-archive it, and it'll show the
+ # nicer error message explaining why the url was rejected if it fails.
+ archive_org_url = archive_org_url or submit_url
+ with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
+ f.write(archive_org_url)
+ chmod_file('archive.org.txt', cwd=out_dir)
+ output = archive_org_url
+
+ return ArchiveResult(
+ cmd=cmd,
+ pwd=out_dir,
+ cmd_version=CURL_VERSION,
+ output=output,
+ status=status,
+ **timer.stats,
+ )
+
+@enforce_types
+def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
+ # Parse archive.org response headers
+ headers: Dict[str, List[str]] = defaultdict(list)
+
+ # lowercase all the header names and store in dict
+ for header in response.splitlines():
+ if b':' not in header or not header.strip():
+ continue
+ name, val = header.decode().split(':', 1)
+ headers[name.lower().strip()].append(val.strip())
+
+ # Get successful archive url in "content-location" header or any errors
+ content_location = headers['content-location']
+ errors = headers['x-archive-wayback-runtime-error']
+ return content_location, errors
+
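
parse_archive_dot_org_response() only needs the raw header block that curl --head prints, so it can be exercised without touching the network. A self-contained sketch of the same header parsing (illustration only, not part of the patch; the sample response bytes are made up):

from collections import defaultdict
from typing import Dict, List, Tuple

def parse_headers(response: bytes) -> Tuple[List[str], List[str]]:
    # same approach as parse_archive_dot_org_response(): lowercase the header names,
    # collect every value, then read content-location / x-archive-wayback-runtime-error
    headers: Dict[str, List[str]] = defaultdict(list)
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())
    return headers['content-location'], headers['x-archive-wayback-runtime-error']

sample = (
    b'HTTP/2 302\r\n'
    b'Content-Location: /web/20190101000000/https://example.com\r\n'
    b'\r\n'
)
content_location, errors = parse_headers(sample)
print(content_location)  # ['/web/20190101000000/https://example.com']
print(errors)            # []
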
diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py
new file mode 100644
index 00000000..a002302f
--- /dev/null
+++ b/archivebox/extractors/dom.py
@@ -0,0 +1,73 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+ enforce_types,
+ TimedProgress,
+ run,
+ PIPE,
+ is_static_file,
+ ArchiveError,
+ chrome_args,
+ chmod_file,
+)
+from ..config import (
+ TIMEOUT,
+ SAVE_DOM,
+ CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
+ out_dir = out_dir or link.link_dir
+ if is_static_file(link.url):
+ return False
+
+ if os.path.exists(os.path.join(out_dir, 'output.html')):
+ return False
+
+ return SAVE_DOM
+
+@enforce_types
+def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+ """print HTML of site to file using chrome --dump-html"""
+
+ out_dir = out_dir or link.link_dir
+ output: ArchiveOutput = 'output.html'
+ output_path = os.path.join(out_dir, str(output))
+ cmd = [
+ *chrome_args(TIMEOUT=timeout),
+ '--dump-dom',
+ link.url
+ ]
+ status = 'succeeded'
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ with open(output_path, 'w+') as f:
+ result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+ if result.returncode:
+ hints = result.stderr.decode()
+ raise ArchiveError('Failed to save DOM', hints)
+
+ chmod_file(output, cwd=out_dir)
+ except Exception as err:
+ status = 'failed'
+ output = err
+ finally:
+ timer.end()
+
+ return ArchiveResult(
+ cmd=cmd,
+ pwd=out_dir,
+ cmd_version=CHROME_VERSION,
+ output=output,
+ status=status,
+ **timer.stats,
+ )
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
new file mode 100644
index 00000000..0dff3900
--- /dev/null
+++ b/archivebox/extractors/favicon.py
@@ -0,0 +1,65 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+ enforce_types,
+ TimedProgress,
+ domain,
+ run,
+ PIPE,
+ chmod_file,
+)
+from ..config import (
+ TIMEOUT,
+ SAVE_FAVICON,
+ CURL_BINARY,
+ CURL_VERSION,
+ CHECK_SSL_VALIDITY,
+)
+
+
+@enforce_types
+def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
+ out_dir = out_dir or link.link_dir
+ if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
+ return False
+
+ return SAVE_FAVICON
+
+@enforce_types
+def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+ """download site favicon from google's favicon api"""
+
+ out_dir = out_dir or link.link_dir
+ output: ArchiveOutput = 'favicon.ico'
+ cmd = [
+ CURL_BINARY,
+ '--max-time', str(timeout),
+ '--location',
+ '--output', str(output),
+ *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+ 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
+ ]
+ status = 'succeeded'
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+ chmod_file(output, cwd=out_dir)
+ except Exception as err:
+ status = 'failed'
+ output = err
+ finally:
+ timer.end()
+
+ return ArchiveResult(
+ cmd=cmd,
+ pwd=out_dir,
+ cmd_version=CURL_VERSION,
+ output=output,
+ status=status,
+ **timer.stats,
+ )
diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py
new file mode 100644
index 00000000..21a86f5e
--- /dev/null
+++ b/archivebox/extractors/git.py
@@ -0,0 +1,94 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+ enforce_types,
+ TimedProgress,
+ run,
+ PIPE,
+ is_static_file,
+ ArchiveError,
+ chmod_file,
+ domain,
+ extension,
+ without_query,
+ without_fragment,
+)
+from ..config import (
+ TIMEOUT,
+ SAVE_GIT,
+ GIT_BINARY,
+ GIT_VERSION,
+ GIT_DOMAINS,
+ CHECK_SSL_VALIDITY
+)
+
+
+
+@enforce_types
+def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
+ out_dir = out_dir or link.link_dir
+ if is_static_file(link.url):
+ return False
+
+ if os.path.exists(os.path.join(out_dir, 'git')):
+ return False
+
+ is_clonable_url = (
+ (domain(link.url) in GIT_DOMAINS)
+ or (extension(link.url) == 'git')
+ )
+ if not is_clonable_url:
+ return False
+
+ return SAVE_GIT
+
+
+@enforce_types
+def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+ """download full site using git"""
+
+ out_dir = out_dir or link.link_dir
+ output: ArchiveOutput = 'git'
+ output_path = os.path.join(out_dir, str(output))
+ os.makedirs(output_path, exist_ok=True)
+ cmd = [
+ GIT_BINARY,
+ 'clone',
+ '--mirror',
+ '--recursive',
+ *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+ without_query(without_fragment(link.url)),
+ ]
+ status = 'succeeded'
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+
+ if result.returncode == 128:
+ # ignore failed re-download when the folder already exists
+ pass
+ elif result.returncode > 0:
+ hints = 'Got git response code: {}.'.format(result.returncode)
+ raise ArchiveError('Failed to save git clone', hints)
+
+ chmod_file(output, cwd=out_dir)
+
+ except Exception as err:
+ status = 'failed'
+ output = err
+ finally:
+ timer.end()
+
+ return ArchiveResult(
+ cmd=cmd,
+ pwd=out_dir,
+ cmd_version=GIT_VERSION,
+ output=output,
+ status=status,
+ **timer.stats,
+ )
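
should_save_git() above decides clonability from the URL alone: either the domain is in GIT_DOMAINS or the path ends in .git, and the clone URL is stripped of its query and fragment first. A rough stdlib-only sketch of that decision (illustration only, not part of the patch; the real domain()/extension()/without_query()/without_fragment() helpers live in archivebox.util and may differ in edge cases):

from urllib.parse import urlparse

GIT_DOMAINS = {'github.com', 'bitbucket.org', 'gitlab.com'}   # the default GIT_DOMAINS config value

def is_clonable(url: str) -> bool:
    parsed = urlparse(url)
    last_segment = parsed.path.rsplit('/', 1)[-1]
    ext = last_segment.rsplit('.', 1)[-1] if '.' in last_segment else ''
    return parsed.netloc in GIT_DOMAINS or ext == 'git'

def clone_url(url: str) -> str:
    # mirrors without_query(without_fragment(url)): drop ?query and #fragment
    return urlparse(url)._replace(query='', fragment='').geturl()

print(is_clonable('https://github.com/pirate/ArchiveBox'))                   # True
print(is_clonable('https://example.com/some/repo.git?ref=main#readme'))      # True
print(clone_url('https://example.com/some/repo.git?ref=main#readme'))        # https://example.com/some/repo.git
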
diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py
new file mode 100644
index 00000000..9fd9a9be
--- /dev/null
+++ b/archivebox/extractors/media.py
@@ -0,0 +1,100 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+ enforce_types,
+ TimedProgress,
+ run,
+ PIPE,
+ is_static_file,
+ ArchiveError,
+ chmod_file,
+)
+from ..config import (
+ MEDIA_TIMEOUT,
+ SAVE_MEDIA,
+ YOUTUBEDL_BINARY,
+ YOUTUBEDL_VERSION,
+ CHECK_SSL_VALIDITY
+)
+
+
+@enforce_types
+def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
+ out_dir = out_dir or link.link_dir
+
+ if is_static_file(link.url):
+ return False
+
+ if os.path.exists(os.path.join(out_dir, 'media')):
+ return False
+
+ return SAVE_MEDIA
+
+@enforce_types
+def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
+ """Download playlists or individual video, audio, and subtitles using youtube-dl"""
+
+ out_dir = out_dir or link.link_dir
+ output: ArchiveOutput = 'media'
+ output_path = os.path.join(out_dir, str(output))
+ os.makedirs(output_path, exist_ok=True)
+ cmd = [
+ YOUTUBEDL_BINARY,
+ '--write-description',
+ '--write-info-json',
+ '--write-annotations',
+ '--yes-playlist',
+ '--write-thumbnail',
+ '--no-call-home',
+ '--no-check-certificate',
+ '--all-subs',
+ '--extract-audio',
+ '--keep-video',
+ '--ignore-errors',
+ '--geo-bypass',
+ '--audio-format', 'mp3',
+ '--audio-quality', '320K',
+ '--embed-thumbnail',
+ '--add-metadata',
+ *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
+ link.url,
+ ]
+ status = 'succeeded'
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+ chmod_file(output, cwd=out_dir)
+ if result.returncode:
+ if (b'ERROR: Unsupported URL' in result.stderr
+ or b'HTTP Error 404' in result.stderr
+ or b'HTTP Error 403' in result.stderr
+ or b'URL could be a direct video link' in result.stderr
+ or b'Unable to extract container ID' in result.stderr):
+ # These happen too frequently on non-media pages to warrant printing to console
+ pass
+ else:
+ hints = (
+ 'Got youtube-dl response code: {}.'.format(result.returncode),
+ *result.stderr.decode().split('\n'),
+ )
+ raise ArchiveError('Failed to save media', hints)
+ except Exception as err:
+ status = 'failed'
+ output = err
+ finally:
+ timer.end()
+
+ return ArchiveResult(
+ cmd=cmd,
+ pwd=out_dir,
+ cmd_version=YOUTUBEDL_VERSION,
+ output=output,
+ status=status,
+ **timer.stats,
+ )
diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py
new file mode 100644
index 00000000..e7ade948
--- /dev/null
+++ b/archivebox/extractors/pdf.py
@@ -0,0 +1,72 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+ enforce_types,
+ TimedProgress,
+ run,
+ PIPE,
+ is_static_file,
+ ArchiveError,
+ chrome_args,
+ chmod_file,
+)
+from ..config import (
+ TIMEOUT,
+ SAVE_PDF,
+ CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
+ out_dir = out_dir or link.link_dir
+ if is_static_file(link.url):
+ return False
+
+ if os.path.exists(os.path.join(out_dir, 'output.pdf')):
+ return False
+
+ return SAVE_PDF
+
+
+@enforce_types
+def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+ """print PDF of site to file using chrome --headless"""
+
+ out_dir = out_dir or link.link_dir
+ output: ArchiveOutput = 'output.pdf'
+ cmd = [
+ *chrome_args(TIMEOUT=timeout),
+ '--print-to-pdf',
+ link.url,
+ ]
+ status = 'succeeded'
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+ if result.returncode:
+ hints = (result.stderr or result.stdout).decode()
+ raise ArchiveError('Failed to save PDF', hints)
+
+ chmod_file('output.pdf', cwd=out_dir)
+ except Exception as err:
+ status = 'failed'
+ output = err
+ finally:
+ timer.end()
+
+ return ArchiveResult(
+ cmd=cmd,
+ pwd=out_dir,
+ cmd_version=CHROME_VERSION,
+ output=output,
+ status=status,
+ **timer.stats,
+ )
diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py
new file mode 100644
index 00000000..3e211939
--- /dev/null
+++ b/archivebox/extractors/screenshot.py
@@ -0,0 +1,71 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+ enforce_types,
+ TimedProgress,
+ run,
+ PIPE,
+ is_static_file,
+ ArchiveError,
+ chrome_args,
+ chmod_file,
+)
+from ..config import (
+ TIMEOUT,
+ SAVE_SCREENSHOT,
+ CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
+ out_dir = out_dir or link.link_dir
+ if is_static_file(link.url):
+ return False
+
+ if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
+ return False
+
+ return SAVE_SCREENSHOT
+
+@enforce_types
+def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+ """take screenshot of site using chrome --headless"""
+
+ out_dir = out_dir or link.link_dir
+ output: ArchiveOutput = 'screenshot.png'
+ cmd = [
+ *chrome_args(TIMEOUT=timeout),
+ '--screenshot',
+ link.url,
+ ]
+ status = 'succeeded'
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+ if result.returncode:
+ hints = (result.stderr or result.stdout).decode()
+ raise ArchiveError('Failed to save screenshot', hints)
+
+ chmod_file(output, cwd=out_dir)
+ except Exception as err:
+ status = 'failed'
+ output = err
+ finally:
+ timer.end()
+
+ return ArchiveResult(
+ cmd=cmd,
+ pwd=out_dir,
+ cmd_version=CHROME_VERSION,
+ output=output,
+ status=status,
+ **timer.stats,
+ )
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
new file mode 100644
index 00000000..c8ba9dd3
--- /dev/null
+++ b/archivebox/extractors/title.py
@@ -0,0 +1,63 @@
+__package__ = 'archivebox.extractors'
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+ enforce_types,
+ TimedProgress,
+ is_static_file,
+ ArchiveError,
+ fetch_page_title,
+)
+from ..config import (
+ TIMEOUT,
+ SAVE_TITLE,
+ CURL_BINARY,
+ CURL_VERSION,
+)
+
+
+@enforce_types
+def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
+ # if link already has valid title, skip it
+ if link.title and not link.title.lower().startswith('http'):
+ return False
+
+ if is_static_file(link.url):
+ return False
+
+ return SAVE_TITLE
+
+@enforce_types
+def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+ """try to guess the page's title from its content"""
+
+ output: ArchiveOutput = None
+ cmd = [
+ CURL_BINARY,
+ link.url,
+ '|',
+ 'grep',
+        '<title>',
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix=' ')
+    try:
+        output = fetch_page_title(link.url, timeout=timeout, progress=False)
+        if not output:
+            raise ArchiveError('Unable to detect page title')
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CURL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
new file mode 100644
--- /dev/null
+++ b/archivebox/extractors/wget.py
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+from datetime import datetime
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    wget_output_path,
+    ArchiveError,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_WGET,
+    SAVE_WARC,
+    SAVE_WGET_REQUISITES,
+    WGET_BINARY,
+    WGET_VERSION,
+    WGET_USER_AGENT,
+    WGET_AUTO_COMPRESSION,
+    COOKIES_FILE,
+    CHECK_SSL_VALIDITY,
+)
+
+
+@enforce_types
+def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
+ output_path = wget_output_path(link)
+ out_dir = out_dir or link.link_dir
+ if output_path and os.path.exists(os.path.join(out_dir, output_path)):
+ return False
+
+ return SAVE_WGET
+
+
+@enforce_types
+def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+ """download full site using wget"""
+
+ out_dir = out_dir or link.link_dir
+ if SAVE_WARC:
+ warc_dir = os.path.join(out_dir, 'warc')
+ os.makedirs(warc_dir, exist_ok=True)
+ warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
+
+ # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
+ output: ArchiveOutput = None
+ cmd = [
+ WGET_BINARY,
+ # '--server-response', # print headers for better error parsing
+ '--no-verbose',
+ '--adjust-extension',
+ '--convert-links',
+ '--force-directories',
+ '--backup-converted',
+ '--span-hosts',
+ '--no-parent',
+ '-e', 'robots=off',
+ '--restrict-file-names=windows',
+ '--timeout={}'.format(timeout),
+ *([] if SAVE_WARC else ['--timestamping']),
+ *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
+ *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
+ *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
+ *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
+ *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
+ *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
+ link.url,
+ ]
+ status = 'succeeded'
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+ output = wget_output_path(link)
+
+ # parse out number of files downloaded from last line of stderr:
+ # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
+ output_tail = [
+ line.strip()
+ for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+ if line.strip()
+ ]
+ files_downloaded = (
+ int(output_tail[-1].strip().split(' ', 2)[1] or 0)
+ if 'Downloaded:' in output_tail[-1]
+ else 0
+ )
+
+ # Check for common failure cases
+ if result.returncode > 0 and files_downloaded < 1:
+ hints = (
+ 'Got wget response code: {}.'.format(result.returncode),
+ *output_tail,
+ )
+ if b'403: Forbidden' in result.stderr:
+ raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
+ if b'404: Not Found' in result.stderr:
+ raise ArchiveError('404 Not Found', hints)
+ if b'ERROR 500: Internal Server Error' in result.stderr:
+ raise ArchiveError('500 Internal Server Error', hints)
+ raise ArchiveError('Got an error from the server', hints)
+
+ # chmod_file(output, cwd=out_dir)
+ except Exception as err:
+ status = 'failed'
+ output = err
+ finally:
+ timer.end()
+
+ return ArchiveResult(
+ cmd=cmd,
+ pwd=out_dir,
+ cmd_version=WGET_VERSION,
+ output=output,
+ status=status,
+ **timer.stats,
+ )
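
save_wget() treats a non-zero exit code as a failure only when wget also reports zero downloaded files, which it digs out of the summary line at the end of wget's output. A standalone sketch of that tail parsing (illustration only, not part of the patch; only the "Downloaded: …" line is quoted from the code comment, the rest of the sample output is made up):

combined_output = (
    '2019-01-01 00:00:00 URL:https://example.com/ [1024] -> "example.com/index.html" [1]\n'
    'FINISHED --2019-01-01 00:00:01--\n'
    'Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)\n'
)

# same slicing as save_wget(): take the last few lines of stdout+stderr and drop blanks
output_tail = [
    line.strip()
    for line in combined_output.rsplit('\n', 3)[-3:]
    if line.strip()
]
files_downloaded = (
    int(output_tail[-1].strip().split(' ', 2)[1] or 0)
    if 'Downloaded:' in output_tail[-1]
    else 0
)
print(files_downloaded)  # 76
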
diff --git a/archivebox/legacy/index.py b/archivebox/index/__init__.py
similarity index 51%
rename from archivebox/legacy/index.py
rename to archivebox/index/__init__.py
index 29b355db..d7b6b43e 100644
--- a/archivebox/legacy/index.py
+++ b/archivebox/index/__init__.py
@@ -1,14 +1,25 @@
-__package__ = 'archivebox.legacy'
+__package__ = 'archivebox.index'
+import re
import os
-import json
+import shutil
+import json as pyjson
-from typing import List, Tuple, Optional, Iterable
+from itertools import chain
+from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
-from .schema import Link, ArchiveResult
-from .config import (
+from ..parsers import parse_links
+from ..util import (
+ scheme,
+ enforce_types,
+ TimedProgress,
+ atomic_write,
+ ExtendedEncoder,
+)
+from ..config import (
+ ARCHIVE_DIR_NAME,
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
@@ -18,26 +29,7 @@ from .config import (
ANSI,
stderr,
)
-from .storage.html import write_html_main_index, write_html_link_details
-from .storage.json import (
- parse_json_main_index,
- write_json_main_index,
- parse_json_link_details,
- write_json_link_details,
-)
-from .storage.sql import (
- write_sql_main_index,
- parse_sql_main_index,
-)
-from .util import (
- scheme,
- enforce_types,
- TimedProgress,
- atomic_write,
- ExtendedEncoder,
-)
-from .parse import parse_links
-from .logs import (
+from ..cli.logging import (
log_indexing_process_started,
log_indexing_process_finished,
log_indexing_started,
@@ -46,6 +38,22 @@ from .logs import (
log_parsing_finished,
)
+from .schema import Link, ArchiveResult
+from .html import (
+ write_html_main_index,
+ write_html_link_details,
+)
+from .json import (
+ parse_json_main_index,
+ write_json_main_index,
+ parse_json_link_details,
+ write_json_link_details,
+)
+from .sql import (
+ write_sql_main_index,
+ parse_sql_main_index,
+)
+
### Link filtering and checking
@enforce_types
@@ -95,11 +103,11 @@ def merge_links(a: Link, b: Link) -> Link:
}
for method in all_methods:
deduped_jsons = {
- json.dumps(result, sort_keys=True, cls=ExtendedEncoder)
+ pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder)
for result in history[method]
}
history[method] = list(reversed(sorted(
- (ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons),
+ (ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons),
key=lambda result: result.start_ts,
)))
@@ -114,7 +122,7 @@ def merge_links(a: Link, b: Link) -> Link:
@enforce_types
-def validate_links(links: Iterable[Link]) -> Iterable[Link]:
+def validate_links(links: Iterable[Link]) -> List[Link]:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links) # deterministically sort the links based on timestamp, url
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
@@ -128,7 +136,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
stderr(' archivebox help')
raise SystemExit(1)
- return links
+ return list(links)
@enforce_types
@@ -259,23 +267,32 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
- meta_dict = json.load(f)
+ meta_dict = pyjson.load(f)
meta_dict.pop('links')
return meta_dict
return None
@enforce_types
-def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
+def import_new_links(existing_links: List[Link],
+ import_path: str,
+ out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+
new_links: List[Link] = []
# parse and validate the import file
log_parsing_started(import_path)
raw_links, parser_name = parse_links(import_path)
- new_links = list(validate_links(raw_links))
+ new_links = validate_links(raw_links)
# merge existing links in out_dir and new links
- all_links = list(validate_links(existing_links + new_links))
+ all_links = validate_links(existing_links + new_links)
+ all_link_urls = {link.url for link in existing_links}
+
+ new_links = [
+ link for link in new_links
+ if link.url not in all_link_urls
+ ]
if parser_name:
num_parsed = len(raw_links)
@@ -345,3 +362,231 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
return merge_links(existing_link, link)
return link
+
+
+
+LINK_FILTERS = {
+ 'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
+ 'substring': lambda link, pattern: pattern in link.url,
+ 'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
+ 'domain': lambda link, pattern: link.domain == pattern,
+}
+
+@enforce_types
+def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
+ for pattern in filter_patterns:
+ try:
+ if LINK_FILTERS[filter_type](link, pattern):
+ return True
+ except Exception:
+ stderr()
+ stderr(
+ f'[X] Got invalid pattern for --filter-type={filter_type}:',
+ color='red',
+ )
+ stderr(f' {pattern}')
+ raise SystemExit(2)
+
+ return False
+
+
+def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """indexed links without checking archive status or data directory validity"""
+ return {
+ link.link_dir: link
+ for link in links
+ }
+
+def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """indexed links that are archived with a valid data directory"""
+ return {
+ link.link_dir: link
+ for link in filter(is_archived, links)
+ }
+
+def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """indexed links that are unarchived with no data directory or an empty data directory"""
+ return {
+ link.link_dir: link
+ for link in filter(is_unarchived, links)
+ }
+
+def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """dirs that are expected to exist based on the main index"""
+ all_folders = {}
+
+ for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+ if entry.is_dir(follow_symlinks=True):
+ link = None
+ try:
+ link = parse_json_link_details(entry.path)
+ except Exception:
+ pass
+
+ all_folders[entry.path] = link
+
+ return all_folders
+
+def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """dirs with a valid index matched to the main index and archived content"""
+ return {
+ link.link_dir: link
+ for link in filter(is_valid, links)
+ }
+
+def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
+    duplicate = get_duplicate_folders(links, out_dir=out_dir)
+    orphaned = get_orphaned_folders(links, out_dir=out_dir)
+    corrupted = get_corrupted_folders(links, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
+ return {**duplicate, **orphaned, **corrupted, **unrecognized}
+
+
+def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """dirs that conflict with other directories that have the same link URL or timestamp"""
+ links = list(links)
+ by_url = {link.url: 0 for link in links}
+ by_timestamp = {link.timestamp: 0 for link in links}
+
+ duplicate_folders = {}
+
+ indexed_folders = {link.link_dir for link in links}
+ data_folders = (
+ entry.path
+ for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
+ if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
+ )
+
+ for path in chain(sorted(indexed_folders), sorted(data_folders)):
+ link = None
+ try:
+ link = parse_json_link_details(path)
+ except Exception:
+ pass
+
+ if link:
+ # link folder has same timestamp as different link folder
+ by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
+ if by_timestamp[link.timestamp] > 1:
+ duplicate_folders[path] = link
+
+ # link folder has same url as different link folder
+ by_url[link.url] = by_url.get(link.url, 0) + 1
+ if by_url[link.url] > 1:
+ duplicate_folders[path] = link
+
+ return duplicate_folders
+
+def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """dirs that contain a valid index but aren't listed in the main index"""
+ links = list(links)
+ indexed_folders = {link.link_dir: link for link in links}
+ orphaned_folders = {}
+
+ for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+ if entry.is_dir(follow_symlinks=True):
+ link = None
+ try:
+ link = parse_json_link_details(entry.path)
+ except Exception:
+ pass
+
+ if link and entry.path not in indexed_folders:
+ # folder is a valid link data dir with index details, but it's not in the main index
+ orphaned_folders[entry.path] = link
+
+ return orphaned_folders
+
+def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """dirs that don't contain a valid index and aren't listed in the main index"""
+ return {
+ link.link_dir: link
+ for link in filter(is_corrupt, links)
+ }
+
+def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+ """dirs that don't contain recognizable archive data and aren't listed in the main index"""
+ by_timestamp = {link.timestamp: 0 for link in links}
+ unrecognized_folders: Dict[str, Optional[Link]] = {}
+
+ for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+ if entry.is_dir(follow_symlinks=True):
+ index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
+ link = None
+ try:
+ link = parse_json_link_details(entry.path)
+ except Exception:
+ pass
+
+ if index_exists and link is None:
+ # index exists but it's corrupted or unparseable
+ unrecognized_folders[entry.path] = link
+
+ elif not index_exists:
+ # link details index doesn't exist and the folder isn't in the main index
+ timestamp = entry.path.rsplit('/', 1)[-1]
+ if timestamp not in by_timestamp:
+ unrecognized_folders[entry.path] = link
+
+ return unrecognized_folders
+
+
+def is_valid(link: Link) -> bool:
+ dir_exists = os.path.exists(link.link_dir)
+ index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
+ if not dir_exists:
+ # unarchived links are not included in the valid list
+ return False
+ if dir_exists and not index_exists:
+ return False
+ if dir_exists and index_exists:
+ try:
+ parsed_link = parse_json_link_details(link.link_dir)
+ return link.url == parsed_link.url
+ except Exception:
+ pass
+ return False
+
+def is_corrupt(link: Link) -> bool:
+ if not os.path.exists(link.link_dir):
+ # unarchived links are not considered corrupt
+ return False
+
+ if is_valid(link):
+ return False
+
+ return True
+
+def is_archived(link: Link) -> bool:
+ return is_valid(link) and link.is_archived
+
+def is_unarchived(link: Link) -> bool:
+ if not os.path.exists(link.link_dir):
+ return True
+ return not link.is_archived
+
+
+def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
+ fixed = []
+ cant_fix = []
+ for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+ if entry.is_dir(follow_symlinks=True):
+ if os.path.exists(os.path.join(entry.path, 'index.json')):
+ link = parse_json_link_details(entry.path)
+ if not link:
+ continue
+
+ if not entry.path.endswith(f'/{link.timestamp}'):
+ dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
+ if os.path.exists(dest):
+ cant_fix.append(entry.path)
+ else:
+ shutil.move(entry.path, dest)
+ fixed.append(dest)
+
+ if link.link_dir != entry.path:
+ link = link.overwrite(link_dir=entry.path)
+ write_json_link_details(link, out_dir=entry.path)
+
+ return fixed, cant_fix
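
link_matches_filter() and the LINK_FILTERS table above are presumably what the list/remove subcommands use to select links by exact URL, substring, regex, or domain. A small sketch of the same predicates (illustration only, not part of the patch; SimpleLink is a made-up stand-in for index.schema.Link, which requires more fields than matter for filtering):

import re
from collections import namedtuple

SimpleLink = namedtuple('SimpleLink', 'url base_url domain')

LINK_FILTERS = {
    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
    'substring': lambda link, pattern: pattern in link.url,
    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
    'domain': lambda link, pattern: link.domain == pattern,
}

def link_matches_filter(link, filter_patterns, filter_type='exact'):
    return any(LINK_FILTERS[filter_type](link, pattern) for pattern in filter_patterns)

link = SimpleLink(
    url='https://example.com/page?utm_source=rss',
    base_url='example.com/page',
    domain='example.com',
)
print(link_matches_filter(link, ['example.com'], filter_type='domain'))       # True
print(link_matches_filter(link, [r'https://example\.com/.*'], 'regex'))       # True
print(link_matches_filter(link, ['/other-page'], 'substring'))                # False
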
diff --git a/archivebox/legacy/storage/html.py b/archivebox/index/html.py
similarity index 98%
rename from archivebox/legacy/storage/html.py
rename to archivebox/index/html.py
index 545c06de..3cba2bf0 100644
--- a/archivebox/legacy/storage/html.py
+++ b/archivebox/index/html.py
@@ -1,11 +1,22 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
import os
from datetime import datetime
from typing import List, Optional, Iterator
-from ..schema import Link
+from .schema import Link
+from ..util import (
+ enforce_types,
+ ts_to_date,
+ urlencode,
+ htmlencode,
+ urldecode,
+ wget_output_path,
+ render_template,
+ atomic_write,
+ copy_and_overwrite,
+)
from ..config import (
OUTPUT_DIR,
TEMPLATES_DIR,
@@ -18,17 +29,6 @@ from ..config import (
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
)
-from ..util import (
- enforce_types,
- ts_to_date,
- urlencode,
- htmlencode,
- urldecode,
- wget_output_path,
- render_template,
- atomic_write,
- copy_and_overwrite,
-)
join = lambda *paths: os.path.join(*paths)
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
diff --git a/archivebox/legacy/storage/json.py b/archivebox/index/json.py
similarity index 90%
rename from archivebox/legacy/storage/json.py
rename to archivebox/index/json.py
index 2ec56fbf..4d75d095 100644
--- a/archivebox/legacy/storage/json.py
+++ b/archivebox/index/json.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
import os
import sys
@@ -7,7 +7,8 @@ import json
from datetime import datetime
from typing import List, Optional, Iterator
-from ..schema import Link, ArchiveResult
+from .schema import Link, ArchiveResult
+from ..util import enforce_types, atomic_write
from ..config import (
VERSION,
OUTPUT_DIR,
@@ -17,14 +18,11 @@ from ..config import (
JSON_INDEX_FILENAME,
ARCHIVE_DIR_NAME,
)
-from ..util import (
- enforce_types,
- atomic_write,
-)
+
MAIN_INDEX_HEADER = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
- 'schema': 'archivebox.legacy.storage.json',
+ 'schema': 'archivebox.index.json',
'copyright_info': FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
@@ -43,7 +41,7 @@ MAIN_INDEX_HEADER = {
@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
- """parse a archive index json file and return the list of links"""
+ """parse an archive index json file and return the list of links"""
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
@@ -110,4 +108,6 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]:
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
if os.path.exists(os.path.join(entry.path, 'index.json')):
- yield parse_json_link_details(entry.path)
+ link = parse_json_link_details(entry.path)
+ if link:
+ yield link
diff --git a/archivebox/legacy/schema.py b/archivebox/index/schema.py
similarity index 93%
rename from archivebox/legacy/schema.py
rename to archivebox/index/schema.py
index 1020f294..1cec34b1 100644
--- a/archivebox/legacy/schema.py
+++ b/archivebox/index/schema.py
@@ -1,3 +1,5 @@
+__package__ = 'archivebox.index'
+
import os
from datetime import datetime
@@ -48,7 +50,7 @@ class ArchiveResult:
@classmethod
def from_json(cls, json_info):
- from .util import parse_date
+ from ..util import parse_date
info = {
key: val
@@ -60,12 +62,12 @@ class ArchiveResult:
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
- from .util import to_json
+ from ..util import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, cols=None, ljust: int=0, separator: str=','):
- from .util import to_json
+ from ..util import to_json
cols = cols or self.field_names()
return separator.join(
@@ -115,7 +117,7 @@ class Link:
return float(self.timestamp) > float(other.timestamp)
def typecheck(self) -> None:
- from .config import stderr, ANSI
+ from ..config import stderr, ANSI
try:
assert self.schema == self.__class__.__name__
assert isinstance(self.timestamp, str) and self.timestamp
@@ -176,7 +178,7 @@ class Link:
@classmethod
def from_json(cls, json_info):
- from .util import parse_date
+ from ..util import parse_date
info = {
key: val
@@ -200,12 +202,12 @@ class Link:
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
- from .util import to_json
+ from ..util import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
- from .util import to_json
+ from ..util import to_json
return separator.join(
to_json(getattr(self, col), indent=None).ljust(ljust)
@@ -218,60 +220,60 @@ class Link:
@property
def link_dir(self) -> str:
- from .config import CONFIG
+ from ..config import CONFIG
return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp)
@property
def archive_path(self) -> str:
- from .config import ARCHIVE_DIR_NAME
+ from ..config import ARCHIVE_DIR_NAME
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
### URL Helpers
@property
def url_hash(self):
- from .util import hashurl
+ from ..util import hashurl
return hashurl(self.url)
@property
def scheme(self) -> str:
- from .util import scheme
+ from ..util import scheme
return scheme(self.url)
@property
def extension(self) -> str:
- from .util import extension
+ from ..util import extension
return extension(self.url)
@property
def domain(self) -> str:
- from .util import domain
+ from ..util import domain
return domain(self.url)
@property
def path(self) -> str:
- from .util import path
+ from ..util import path
return path(self.url)
@property
def basename(self) -> str:
- from .util import basename
+ from ..util import basename
return basename(self.url)
@property
def base_url(self) -> str:
- from .util import base_url
+ from ..util import base_url
return base_url(self.url)
### Pretty Printing Helpers
@property
def bookmarked_date(self) -> Optional[str]:
- from .util import ts_to_date
+ from ..util import ts_to_date
return ts_to_date(self.timestamp) if self.timestamp else None
@property
def updated_date(self) -> Optional[str]:
- from .util import ts_to_date
+ from ..util import ts_to_date
return ts_to_date(self.updated) if self.updated else None
@property
@@ -304,13 +306,13 @@ class Link:
@property
def is_static(self) -> bool:
- from .util import is_static_file
+ from ..util import is_static_file
return is_static_file(self.url)
@property
def is_archived(self) -> bool:
- from .config import ARCHIVE_DIR
- from .util import domain
+ from ..config import ARCHIVE_DIR
+ from ..util import domain
output_paths = (
domain(self.url),
@@ -352,7 +354,7 @@ class Link:
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""predict the expected output paths that should be present after archiving"""
- from .util import wget_output_path
+ from ..util import wget_output_path
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
diff --git a/archivebox/legacy/storage/sql.py b/archivebox/index/sql.py
similarity index 80%
rename from archivebox/legacy/storage/sql.py
rename to archivebox/index/sql.py
index 363be514..942054c2 100644
--- a/archivebox/legacy/storage/sql.py
+++ b/archivebox/index/sql.py
@@ -1,9 +1,9 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
from io import StringIO
from typing import List, Tuple, Iterator
-from ..schema import Link
+from .schema import Link
from ..util import enforce_types
from ..config import setup_django, OUTPUT_DIR
@@ -25,9 +25,19 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
setup_django(out_dir, check_db=True)
from core.models import Page
- for link in links:
+ all_urls = {link.url: link for link in links}
+
+ for page in Page.objects.all():
+ if page.url in all_urls:
+ info = {k: v for k, v in all_urls.pop(page.url)._asdict().items() if k in Page.keys}
+            Page.objects.filter(url=page.url).update(**info)
+ else:
+ page.delete()
+
+ for url, link in all_urls.items():
info = {k: v for k, v in link._asdict().items() if k in Page.keys}
- Page.objects.update_or_create(url=link.url, defaults=info)
+ Page.objects.update_or_create(url=url, defaults=info)
+
@enforce_types
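
The rewritten write_sql_main_index() above now reconciles the Page table with the index instead of blindly upserting: existing rows that match an indexed URL are updated, rows whose URL is no longer indexed are deleted, and the remaining URLs are created. The same strategy sketched with plain dicts instead of the Django ORM (illustration only, not part of the patch):

def sync_index(db_rows: dict, index_links: dict) -> dict:
    remaining = dict(index_links)                 # like {link.url: link for link in links}
    synced = {}
    for url, row in db_rows.items():
        if url in remaining:
            synced[url] = {**row, **remaining.pop(url)}   # like Page.objects.filter(url=...).update(**info)
        # else: row is dropped, like page.delete()
    synced.update(remaining)                      # like Page.objects.update_or_create(url=url, defaults=info)
    return synced

db = {'https://a.example': {'title': 'old A'}, 'https://stale.example': {'title': 'stale'}}
index = {'https://a.example': {'title': 'new A'}, 'https://b.example': {'title': 'B'}}
print(sync_index(db, index))
# {'https://a.example': {'title': 'new A'}, 'https://b.example': {'title': 'B'}}
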
diff --git a/archivebox/legacy/ArchiveBox.conf b/archivebox/legacy/ArchiveBox.conf
deleted file mode 100644
index fe7b674c..00000000
--- a/archivebox/legacy/ArchiveBox.conf
+++ /dev/null
@@ -1,58 +0,0 @@
-# This is the example default configiration file for ArchiveBox.
-#
-# Copy example config from here into your project's ArchiveBox.conf file,
-# DO NOT EDIT THIS FILE DIRECTLY!
-#
-# See the list of all the possible options. documentation, and examples here:
-# https://github.com/pirate/ArchiveBox/wiki/Configuration
-
-[GENERAL_CONFIG]
-OUTPUT_PERMISSIONS = 755
-ONLY_NEW = False
-TIMEOUT = 60
-MEDIA_TIMEOUT = 3600
-ACTIVE_THEME = default
-FOOTER_INFO = Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
-URL_BLACKLIST = (://(.*\.)?facebook\.com)|(://(.*\.)?ebay\.com)|(.*\.exe$)
-
-[ARCHIVE_METHOD_TOGGLES]
-SAVE_TITLE = True
-SAVE_FAVICON = True
-SAVE_WGET = True
-SAVE_WGET_REQUISITES = True
-SAVE_WARC = True
-SAVE_PDF = True
-SAVE_SCREENSHOT = True
-SAVE_DOM = True
-SAVE_GIT = True
-SAVE_MEDIA = False
-SAVE_ARCHIVE_DOT_ORG = True
-
-
-[ARCHIVE_METHOD_OPTIONS]
-CHECK_SSL_VALIDITY = True
-RESOLUTION = 1440,900
-GIT_DOMAINS = github.com,bitbucket.org,gitlab.com
-
-CROME_HEADLESS = True
-CROME_SANDBOX = True
-
-COOKIES_FILE = path/to/cookies.txt
-CHROME_USER_DATA_DIR = ~/.config/google-chrome/Default
-
-WGET_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
-CHROME_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
-
-
-[DEPENDENCY_CONFIG]
-USE_CURL = True
-USE_WGET = True
-USE_CHROME = True
-USE_YOUTUBEDL = True
-USE_GIT = True
-
-CURL_BINARY = curl
-GIT_BINARY = git"
-WGET_BINARY = wget
-YOUTUBEDL_BINARY = youtube-dl
-CHROME_BINARY = chromium
diff --git a/archivebox/legacy/__init__.py b/archivebox/legacy/__init__.py
deleted file mode 100644
index 2bbcd2fc..00000000
--- a/archivebox/legacy/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__package__ = 'archivebox.legacy'
diff --git a/archivebox/legacy/archive_methods.py b/archivebox/legacy/archive_methods.py
deleted file mode 100644
index de4f8668..00000000
--- a/archivebox/legacy/archive_methods.py
+++ /dev/null
@@ -1,694 +0,0 @@
-import os
-
-from typing import Dict, List, Tuple, Optional
-from collections import defaultdict
-from datetime import datetime
-
-from .schema import Link, ArchiveResult, ArchiveOutput
-from .index import (
- load_link_details,
- write_link_details,
- patch_main_index,
-)
-from .config import (
- CURL_BINARY,
- GIT_BINARY,
- WGET_BINARY,
- YOUTUBEDL_BINARY,
- SAVE_FAVICON,
- SAVE_TITLE,
- SAVE_WGET,
- SAVE_WGET_REQUISITES,
- SAVE_PDF,
- SAVE_SCREENSHOT,
- SAVE_DOM,
- SAVE_WARC,
- SAVE_GIT,
- SAVE_MEDIA,
- SAVE_ARCHIVE_DOT_ORG,
- TIMEOUT,
- MEDIA_TIMEOUT,
- GIT_DOMAINS,
- VERSION,
- WGET_USER_AGENT,
- CHECK_SSL_VALIDITY,
- COOKIES_FILE,
- CURL_VERSION,
- WGET_VERSION,
- CHROME_VERSION,
- GIT_VERSION,
- YOUTUBEDL_VERSION,
- WGET_AUTO_COMPRESSION,
-)
-from .util import (
- enforce_types,
- domain,
- extension,
- without_query,
- without_fragment,
- fetch_page_title,
- is_static_file,
- TimedProgress,
- chmod_file,
- wget_output_path,
- chrome_args,
- run, PIPE, DEVNULL,
-)
-from .logs import (
- log_link_archiving_started,
- log_link_archiving_finished,
- log_archive_method_started,
- log_archive_method_finished,
-)
-
-
-class ArchiveError(Exception):
- def __init__(self, message, hints=None):
- super().__init__(message)
- self.hints = hints
-
-
-@enforce_types
-def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
- """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
-
- ARCHIVE_METHODS = (
- ('title', should_save_title, save_title),
- ('favicon', should_save_favicon, save_favicon),
- ('wget', should_save_wget, save_wget),
- ('pdf', should_save_pdf, save_pdf),
- ('screenshot', should_save_screenshot, save_screenshot),
- ('dom', should_save_dom, save_dom),
- ('git', should_save_git, save_git),
- ('media', should_save_media, save_media),
- ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
- )
-
- out_dir = out_dir or link.link_dir
- try:
- is_new = not os.path.exists(out_dir)
- if is_new:
- os.makedirs(out_dir)
-
- link = load_link_details(link, out_dir=out_dir)
- log_link_archiving_started(link, out_dir, is_new)
- link = link.overwrite(updated=datetime.now())
- stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
-
- for method_name, should_run, method_function in ARCHIVE_METHODS:
- try:
- if method_name not in link.history:
- link.history[method_name] = []
-
- if should_run(link, out_dir):
- log_archive_method_started(method_name)
-
- result = method_function(link=link, out_dir=out_dir)
-
- link.history[method_name].append(result)
-
- stats[result.status] += 1
- log_archive_method_finished(result)
- else:
- stats['skipped'] += 1
- except Exception as e:
- raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
- method_name,
- link.url,
- )) from e
-
- # print(' ', stats)
-
- write_link_details(link, out_dir=link.link_dir)
- patch_main_index(link)
-
- # # If any changes were made, update the main links index json and html
- # was_changed = stats['succeeded'] or stats['failed']
- # if was_changed:
- # patch_main_index(link)
-
- log_link_archiving_finished(link, link.link_dir, is_new, stats)
-
- except KeyboardInterrupt:
- try:
- write_link_details(link, out_dir=link.link_dir)
- except:
- pass
- raise
-
- except Exception as err:
- print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
- raise
-
- return link
-
-
-### Archive Method Functions
-
-@enforce_types
-def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
- # if link already has valid title, skip it
- if link.title and not link.title.lower().startswith('http'):
- return False
-
- if is_static_file(link.url):
- return False
-
- return SAVE_TITLE
-
-@enforce_types
-def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
- """try to guess the page's title from its content"""
-
- output: ArchiveOutput = None
- cmd = [
- CURL_BINARY,
- link.url,
- '|',
- 'grep',
-        '<title>',
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix=' ')
-    try:
-        output = fetch_page_title(link.url, timeout=timeout, progress=False)
-        if not output:
-            raise ArchiveError('Unable to detect page title')
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CURL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-
-@enforce_types
-def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
- if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
- return False
-
- return SAVE_FAVICON
-
-@enforce_types
-def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
- """download site favicon from google's favicon api"""
-
- out_dir = out_dir or link.link_dir
- output: ArchiveOutput = 'favicon.ico'
- cmd = [
- CURL_BINARY,
- '--max-time', str(timeout),
- '--location',
- '--output', str(output),
- *([] if CHECK_SSL_VALIDITY else ['--insecure']),
- 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
- ]
- status = 'succeeded'
- timer = TimedProgress(timeout, prefix=' ')
- try:
- run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
- chmod_file(output, cwd=out_dir)
- except Exception as err:
- status = 'failed'
- output = err
- finally:
- timer.end()
-
- return ArchiveResult(
- cmd=cmd,
- pwd=out_dir,
- cmd_version=CURL_VERSION,
- output=output,
- status=status,
- **timer.stats,
- )
-
-@enforce_types
-def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
- output_path = wget_output_path(link)
- out_dir = out_dir or link.link_dir
- if output_path and os.path.exists(os.path.join(out_dir, output_path)):
- return False
-
- return SAVE_WGET
-
-
-@enforce_types
-def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
- """download full site using wget"""
-
- out_dir = out_dir or link.link_dir
- if SAVE_WARC:
- warc_dir = os.path.join(out_dir, 'warc')
- os.makedirs(warc_dir, exist_ok=True)
- warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
-
- # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
- output: ArchiveOutput = None
- cmd = [
- WGET_BINARY,
- # '--server-response', # print headers for better error parsing
- '--no-verbose',
- '--adjust-extension',
- '--convert-links',
- '--force-directories',
- '--backup-converted',
- '--span-hosts',
- '--no-parent',
- '-e', 'robots=off',
- '--restrict-file-names=windows',
- '--timeout={}'.format(timeout),
- *([] if SAVE_WARC else ['--timestamping']),
- *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
- *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
- *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
- *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
- *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
- *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
- link.url,
- ]
- status = 'succeeded'
- timer = TimedProgress(timeout, prefix=' ')
- try:
- result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
- output = wget_output_path(link)
-
- # parse out number of files downloaded from last line of stderr:
- # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
- output_tail = [
- line.strip()
- for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
- if line.strip()
- ]
- files_downloaded = (
- int(output_tail[-1].strip().split(' ', 2)[1] or 0)
- if 'Downloaded:' in output_tail[-1]
- else 0
- )
-
- # Check for common failure cases
- if result.returncode > 0 and files_downloaded < 1:
- hints = (
- 'Got wget response code: {}.'.format(result.returncode),
- *output_tail,
- )
- if b'403: Forbidden' in result.stderr:
- raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
- if b'404: Not Found' in result.stderr:
- raise ArchiveError('404 Not Found', hints)
- if b'ERROR 500: Internal Server Error' in result.stderr:
- raise ArchiveError('500 Internal Server Error', hints)
- raise ArchiveError('Got an error from the server', hints)
- except Exception as err:
- status = 'failed'
- output = err
- finally:
- timer.end()
-
- return ArchiveResult(
- cmd=cmd,
- pwd=out_dir,
- cmd_version=WGET_VERSION,
- output=output,
- status=status,
- **timer.stats,
- )
-
-@enforce_types
-def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
- if is_static_file(link.url):
- return False
-
- if os.path.exists(os.path.join(out_dir, 'output.pdf')):
- return False
-
- return SAVE_PDF
-
-
-@enforce_types
-def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
- """print PDF of site to file using chrome --headless"""
-
- out_dir = out_dir or link.link_dir
- output: ArchiveOutput = 'output.pdf'
- cmd = [
- *chrome_args(TIMEOUT=timeout),
- '--print-to-pdf',
- link.url,
- ]
- status = 'succeeded'
- timer = TimedProgress(timeout, prefix=' ')
- try:
- result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
- if result.returncode:
- hints = (result.stderr or result.stdout).decode()
- raise ArchiveError('Failed to save PDF', hints)
-
- chmod_file('output.pdf', cwd=out_dir)
- except Exception as err:
- status = 'failed'
- output = err
- finally:
- timer.end()
-
- return ArchiveResult(
- cmd=cmd,
- pwd=out_dir,
- cmd_version=CHROME_VERSION,
- output=output,
- status=status,
- **timer.stats,
- )
-
-@enforce_types
-def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
- if is_static_file(link.url):
- return False
-
- if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
- return False
-
- return SAVE_SCREENSHOT
-
-@enforce_types
-def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
- """take screenshot of site using chrome --headless"""
-
- out_dir = out_dir or link.link_dir
- output: ArchiveOutput = 'screenshot.png'
- cmd = [
- *chrome_args(TIMEOUT=timeout),
- '--screenshot',
- link.url,
- ]
- status = 'succeeded'
- timer = TimedProgress(timeout, prefix=' ')
- try:
- result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
- if result.returncode:
- hints = (result.stderr or result.stdout).decode()
- raise ArchiveError('Failed to save screenshot', hints)
-
- chmod_file(output, cwd=out_dir)
- except Exception as err:
- status = 'failed'
- output = err
- finally:
- timer.end()
-
- return ArchiveResult(
- cmd=cmd,
- pwd=out_dir,
- cmd_version=CHROME_VERSION,
- output=output,
- status=status,
- **timer.stats,
- )
-
-@enforce_types
-def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
- if is_static_file(link.url):
- return False
-
- if os.path.exists(os.path.join(out_dir, 'output.html')):
- return False
-
- return SAVE_DOM
-
-@enforce_types
-def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
- """print HTML of site to file using chrome --dump-html"""
-
- out_dir = out_dir or link.link_dir
- output: ArchiveOutput = 'output.html'
- output_path = os.path.join(out_dir, str(output))
- cmd = [
- *chrome_args(TIMEOUT=timeout),
- '--dump-dom',
- link.url
- ]
- status = 'succeeded'
- timer = TimedProgress(timeout, prefix=' ')
- try:
- with open(output_path, 'w+') as f:
- result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
- if result.returncode:
- hints = result.stderr.decode()
- raise ArchiveError('Failed to save DOM', hints)
-
- chmod_file(output, cwd=out_dir)
- except Exception as err:
- status = 'failed'
- output = err
- finally:
- timer.end()
-
- return ArchiveResult(
- cmd=cmd,
- pwd=out_dir,
- cmd_version=CHROME_VERSION,
- output=output,
- status=status,
- **timer.stats,
- )
-
-@enforce_types
-def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
- if is_static_file(link.url):
- return False
-
- if os.path.exists(os.path.join(out_dir, 'git')):
- return False
-
- is_clonable_url = (
- (domain(link.url) in GIT_DOMAINS)
- or (extension(link.url) == 'git')
- )
- if not is_clonable_url:
- return False
-
- return SAVE_GIT
-
-
-@enforce_types
-def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
- """download full site using git"""
-
- out_dir = out_dir or link.link_dir
- output: ArchiveOutput = 'git'
- output_path = os.path.join(out_dir, str(output))
- os.makedirs(output_path, exist_ok=True)
- cmd = [
- GIT_BINARY,
- 'clone',
- '--mirror',
- '--recursive',
- *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
- without_query(without_fragment(link.url)),
- ]
- status = 'succeeded'
- timer = TimedProgress(timeout, prefix=' ')
- try:
- result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
-
- if result.returncode == 128:
- # ignore failed re-download when the folder already exists
- pass
- elif result.returncode > 0:
- hints = 'Got git response code: {}.'.format(result.returncode)
- raise ArchiveError('Failed to save git clone', hints)
-
- except Exception as err:
- status = 'failed'
- output = err
- finally:
- timer.end()
-
- return ArchiveResult(
- cmd=cmd,
- pwd=out_dir,
- cmd_version=GIT_VERSION,
- output=output,
- status=status,
- **timer.stats,
- )
-
-
-@enforce_types
-def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
-
- if is_static_file(link.url):
- return False
-
- if os.path.exists(os.path.join(out_dir, 'media')):
- return False
-
- return SAVE_MEDIA
-
-@enforce_types
-def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
- """Download playlists or individual video, audio, and subtitles using youtube-dl"""
-
- out_dir = out_dir or link.link_dir
- output: ArchiveOutput = 'media'
- output_path = os.path.join(out_dir, str(output))
- os.makedirs(output_path, exist_ok=True)
- cmd = [
- YOUTUBEDL_BINARY,
- '--write-description',
- '--write-info-json',
- '--write-annotations',
- '--yes-playlist',
- '--write-thumbnail',
- '--no-call-home',
- '--no-check-certificate',
- '--user-agent',
- '--all-subs',
- '--extract-audio',
- '--keep-video',
- '--ignore-errors',
- '--geo-bypass',
- '--audio-format', 'mp3',
- '--audio-quality', '320K',
- '--embed-thumbnail',
- '--add-metadata',
- *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
- link.url,
- ]
- status = 'succeeded'
- timer = TimedProgress(timeout, prefix=' ')
- try:
- result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
- chmod_file(output, cwd=out_dir)
- if result.returncode:
- if (b'ERROR: Unsupported URL' in result.stderr
- or b'HTTP Error 404' in result.stderr
- or b'HTTP Error 403' in result.stderr
- or b'URL could be a direct video link' in result.stderr
- or b'Unable to extract container ID' in result.stderr):
- # These happen too frequently on non-media pages to warrant printing to console
- pass
- else:
- hints = (
- 'Got youtube-dl response code: {}.'.format(result.returncode),
- *result.stderr.decode().split('\n'),
- )
- raise ArchiveError('Failed to save media', hints)
- except Exception as err:
- status = 'failed'
- output = err
- finally:
- timer.end()
-
- return ArchiveResult(
- cmd=cmd,
- pwd=out_dir,
- cmd_version=YOUTUBEDL_VERSION,
- output=output,
- status=status,
- **timer.stats,
- )
-
-
-@enforce_types
-def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
- if is_static_file(link.url):
- return False
-
- if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
- # if open(path, 'r').read().strip() != 'None':
- return False
-
- return SAVE_ARCHIVE_DOT_ORG
-
-@enforce_types
-def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
- """submit site to archive.org for archiving via their service, save returned archive url"""
-
- out_dir = out_dir or link.link_dir
- output: ArchiveOutput = 'archive.org.txt'
- archive_org_url = None
- submit_url = 'https://web.archive.org/save/{}'.format(link.url)
- cmd = [
- CURL_BINARY,
- '--location',
- '--head',
- '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
- '--max-time', str(timeout),
- *([] if CHECK_SSL_VALIDITY else ['--insecure']),
- submit_url,
- ]
- status = 'succeeded'
- timer = TimedProgress(timeout, prefix=' ')
- try:
- result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
- content_location, errors = parse_archive_dot_org_response(result.stdout)
- if content_location:
- archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
- elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
- archive_org_url = None
- # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
- elif errors:
- raise ArchiveError(', '.join(errors))
- else:
- raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
- except Exception as err:
- status = 'failed'
- output = err
- finally:
- timer.end()
-
- if output and not isinstance(output, Exception):
- # instead of writing None when archive.org rejects the url write the
- # url to resubmit it to archive.org. This is so when the user visits
- # the URL in person, it will attempt to re-archive it, and it'll show the
- # nicer error message explaining why the url was rejected if it fails.
- archive_org_url = archive_org_url or submit_url
- with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
- f.write(archive_org_url)
- chmod_file('archive.org.txt', cwd=out_dir)
- output = archive_org_url
-
- return ArchiveResult(
- cmd=cmd,
- pwd=out_dir,
- cmd_version=CURL_VERSION,
- output=output,
- status=status,
- **timer.stats,
- )
-
-@enforce_types
-def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
- # Parse archive.org response headers
- headers: Dict[str, List[str]] = defaultdict(list)
-
- # lowercase all the header names and store in dict
- for header in response.splitlines():
- if b':' not in header or not header.strip():
- continue
- name, val = header.decode().split(':', 1)
- headers[name.lower().strip()].append(val.strip())
-
- # Get successful archive url in "content-location" header or any errors
- content_location = headers['content-location']
- errors = headers['x-archive-wayback-runtime-error']
- return content_location, errors
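
The archive.org extractor above submits a page by requesting https://web.archive.org/save/<url> with a `curl --head` call, then scans the reply for a `content-location` header (the snapshot path) or an `x-archive-wayback-runtime-error` header. A minimal standalone sketch of that header-scanning step, run against a hypothetical captured response (the header values below are invented for illustration, not real archive.org output):

    from collections import defaultdict
    from typing import Dict, List, Tuple

    def parse_wayback_headers(response: bytes) -> Tuple[List[str], List[str]]:
        # lowercase every header name and collect its values, mirroring the helper above
        headers: Dict[str, List[str]] = defaultdict(list)
        for header in response.splitlines():
            if b':' not in header or not header.strip():
                continue
            name, val = header.decode().split(':', 1)
            headers[name.lower().strip()].append(val.strip())
        return headers['content-location'], headers['x-archive-wayback-runtime-error']

    # hypothetical `curl --head` output for a successful submission
    sample = (
        b'HTTP/2 200\r\n'
        b'Content-Location: /web/20190401000000/https://example.com/\r\n'
        b'Content-Type: text/html; charset=UTF-8\r\n'
    )
    snapshots, errors = parse_wayback_headers(sample)
    assert snapshots == ['/web/20190401000000/https://example.com/']
    assert errors == []

When `content-location` is present, the extractor records `https://web.archive.org` plus that path as the saved snapshot URL; otherwise it falls back to writing the submit URL so a later visit can retry the submission.
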
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
deleted file mode 100644
index 4095fa24..00000000
--- a/archivebox/legacy/main.py
+++ /dev/null
@@ -1,626 +0,0 @@
-import os
-import re
-import shutil
-
-from typing import Dict, List, Optional, Iterable
-from itertools import chain
-
-from .schema import Link
-from .util import (
- enforce_types,
- TimedProgress,
- get_dir_size,
- human_readable_size,
-)
-from .index import (
- links_after_timestamp,
- load_main_index,
- import_new_links,
- write_main_index,
-)
-from .storage.json import (
- parse_json_main_index,
- parse_json_link_details,
- parse_json_links_details,
-)
-from .storage.sql import parse_sql_main_index, get_admins
-from .storage.html import parse_html_main_index
-from .archive_methods import archive_link
-from .config import (
- stderr,
- ANSI,
- ONLY_NEW,
- OUTPUT_DIR,
- SOURCES_DIR,
- ARCHIVE_DIR,
- LOGS_DIR,
- CONFIG_FILE,
- ARCHIVE_DIR_NAME,
- SOURCES_DIR_NAME,
- LOGS_DIR_NAME,
- STATIC_DIR_NAME,
- JSON_INDEX_FILENAME,
- HTML_INDEX_FILENAME,
- SQL_INDEX_FILENAME,
- ROBOTS_TXT_FILENAME,
- FAVICON_FILENAME,
- check_dependencies,
- check_data_folder,
- setup_django,
- write_config_file,
-)
-from .logs import (
- log_archiving_started,
- log_archiving_paused,
- log_archiving_finished,
- log_removal_started,
- log_removal_finished,
- log_list_started,
- log_list_finished,
-)
-
-
-ALLOWED_IN_OUTPUT_DIR = {
- '.DS_Store',
- '.venv',
- 'venv',
- 'virtualenv',
- '.virtualenv',
- ARCHIVE_DIR_NAME,
- SOURCES_DIR_NAME,
- LOGS_DIR_NAME,
- STATIC_DIR_NAME,
- SQL_INDEX_FILENAME,
- JSON_INDEX_FILENAME,
- HTML_INDEX_FILENAME,
- ROBOTS_TXT_FILENAME,
- FAVICON_FILENAME,
-}
-
-
-@enforce_types
-def init():
- os.makedirs(OUTPUT_DIR, exist_ok=True)
-
- is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
- existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
-
- if is_empty and not existing_index:
- print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
- print(f' {OUTPUT_DIR}')
- print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
- elif existing_index:
- print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
- print(f' {OUTPUT_DIR}')
- print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
- else:
- stderr(
- ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
- " You must run init in a completely empty directory, or an existing data folder.\n\n"
- " {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
- " then run and run 'archivebox init' to pick up where you left off.\n\n"
- " (Always make sure your data folder is backed up first before updating ArchiveBox)"
- ).format(OUTPUT_DIR, **ANSI)
- )
- raise SystemExit(1)
-
- if existing_index:
- print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
- else:
- print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
-
- os.makedirs(SOURCES_DIR, exist_ok=True)
- print(f' √ {SOURCES_DIR}')
-
- os.makedirs(ARCHIVE_DIR, exist_ok=True)
- print(f' √ {ARCHIVE_DIR}')
-
- os.makedirs(LOGS_DIR, exist_ok=True)
- print(f' √ {LOGS_DIR}')
-
- write_config_file({}, out_dir=OUTPUT_DIR)
- print(f' √ {CONFIG_FILE}')
-
- if os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)):
- print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
- else:
- print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
-
- setup_django(OUTPUT_DIR, check_db=False)
- from django.conf import settings
- assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
- print(f' √ {settings.DATABASE_FILE}')
- print()
- from .storage.sql import apply_migrations
- for migration_line in apply_migrations(OUTPUT_DIR):
- print(f' {migration_line}')
-
-
- assert os.path.exists(settings.DATABASE_FILE)
-
- # from django.contrib.auth.models import User
- # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
- # print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
- # call_command("createsuperuser", interactive=True)
-
- print()
- print('{green}[*] Collecting links from any existing index or archive folders...{reset}'.format(**ANSI))
-
- all_links = {}
- if existing_index:
- all_links = {
- link.url: link
- for link in load_main_index(out_dir=OUTPUT_DIR, warn=False)
- }
- print(' √ Loaded {} links from existing main index...'.format(len(all_links)))
-
- orphaned_json_links = {
- link.url: link
- for link in parse_json_main_index(OUTPUT_DIR)
- if link.url not in all_links
- }
- if orphaned_json_links:
- all_links.update(orphaned_json_links)
- print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
-
- orphaned_sql_links = {
- link.url: link
- for link in parse_sql_main_index(OUTPUT_DIR)
- if link.url not in all_links
- }
- if orphaned_sql_links:
- all_links.update(orphaned_sql_links)
- print(' {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
-
- orphaned_data_dir_links = {
- link.url: link
- for link in parse_json_links_details(OUTPUT_DIR)
- }
- orphan_new_links = {
- url: link
- for url, link in orphaned_data_dir_links.items()
- if url not in all_links
- }
- orphan_duplicates = {
- url: link
- for url, link in orphaned_data_dir_links.items()
- if url in all_links
- }
- if orphan_new_links:
- all_links.update(orphan_new_links)
- print(' {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
- if orphan_duplicates:
- print(' {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI))
-
- orphaned_data_dirs = {folder for folder in orphan_duplicates.keys()}
- invalid_folders = {
- folder: link
- for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
- if folder not in orphaned_data_dirs
- }
- if invalid_folders:
- print(' {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(orphan_duplicates), **ANSI))
-
- if orphan_duplicates or invalid_folders:
- print(' For more information about the link data directories that were skipped, run:')
- print(' archivebox info')
- print(' archivebox list --status=invalid')
- print(' archivebox list --status=orphaned')
- print(' archivebox list --status=duplicate')
-
-
- write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)
-
- print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
- if existing_index:
- print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
- else:
- print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
- print()
- print(' To view your archive index, open:')
- print(' {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)))
- print()
- print(' To add new links, you can run:')
- print(" archivebox add 'https://example.com'")
- print()
- print(' For more usage and examples, run:')
- print(' archivebox help')
-
-
-@enforce_types
-def info():
-
- print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
- print(f' {OUTPUT_DIR}/*')
- num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False, pattern='index.')
- size = human_readable_size(num_bytes)
- print(f' Size: {size} across {num_files} files')
- print()
-
- links = list(load_main_index(out_dir=OUTPUT_DIR))
- num_json_links = len(links)
- num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=OUTPUT_DIR))
- num_html_links = sum(1 for url in parse_html_main_index(out_dir=OUTPUT_DIR))
- num_link_details = sum(1 for link in parse_json_links_details(out_dir=OUTPUT_DIR))
- users = get_admins().values_list('username', flat=True)
- print(f' > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
- print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
- print(f' > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
- print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
-
- print(f' > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-
- if num_html_links != len(links) or num_sql_links != len(links):
- print()
- print(' {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
- print(' archivebox init')
-
- if not users:
- print()
- print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
- print(' archivebox manage createsuperuser')
-
- print()
- print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
- print(f' {ARCHIVE_DIR}/*')
-
- num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
- size = human_readable_size(num_bytes)
- print(f' Size: {size} across {num_files} files in {num_dirs} directories')
- print()
-
- num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
- num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
- num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
- print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
- print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
- print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
-
- num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
- num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
- print()
- print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
- print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
-
- duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
- orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
- corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
- unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
- num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
- print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
- print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
- print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
- print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
- print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
-
- if num_indexed:
- print()
- print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
- print(' archivebox list --status= (e.g. indexed, corrupted, archived, etc.)')
-
- if orphaned:
- print()
- print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
- print(' archivebox init')
-
- if num_invalid:
- print()
- print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
- print(' archivebox init')
-
- print()
-
-
-
-@enforce_types
-def update_archive_data(import_path: Optional[str]=None,
- resume: Optional[float]=None,
- only_new: bool=False,
- index_only: bool=False) -> List[Link]:
- """The main ArchiveBox entrancepoint. Everything starts here."""
-
- check_dependencies()
- check_data_folder()
-
- # Step 1: Load list of links from the existing index
- # merge in and dedupe new links from import_path
- all_links: List[Link] = []
- new_links: List[Link] = []
- all_links = load_main_index(out_dir=OUTPUT_DIR)
- if import_path:
- all_links, new_links = import_new_links(all_links, import_path)
-
- # Step 2: Write updated index with deduped old and new links back to disk
- write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
-
- if index_only:
- return all_links
-
- # Step 3: Run the archive methods for each link
- links = new_links if ONLY_NEW else all_links
- log_archiving_started(len(links), resume)
- idx: int = 0
- link: Link = None # type: ignore
- try:
- for idx, link in enumerate(links_after_timestamp(links, resume)):
- archive_link(link, out_dir=link.link_dir)
-
- except KeyboardInterrupt:
- log_archiving_paused(len(links), idx, link.timestamp if link else '0')
- raise SystemExit(0)
-
- except:
- print()
- raise
-
- log_archiving_finished(len(links))
-
- # Step 4: Re-write links index with updated titles, icons, and resources
- all_links = load_main_index(out_dir=OUTPUT_DIR)
- write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
- return all_links
-
-
-LINK_FILTERS = {
- 'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
- 'substring': lambda link, pattern: pattern in link.url,
- 'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
- 'domain': lambda link, pattern: link.domain == pattern,
-}
-
-@enforce_types
-def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
- for pattern in filter_patterns:
- if LINK_FILTERS[filter_type](link, pattern):
- return True
-
- return False
-
-
-@enforce_types
-def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
- after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
-
- all_links = load_main_index(out_dir=OUTPUT_DIR)
-
- for link in all_links:
- if after is not None and float(link.timestamp) < after:
- continue
- if before is not None and float(link.timestamp) > before:
- continue
-
- if filter_patterns:
- if link_matches_filter(link, filter_patterns, filter_type):
- yield link
- else:
- yield link
-
-
-@enforce_types
-def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
- after: Optional[float]=None, before: Optional[float]=None,
- yes: bool=False, delete: bool=False) -> List[Link]:
-
- check_dependencies()
- check_data_folder()
-
- log_list_started(filter_patterns, filter_type)
- timer = TimedProgress(360, prefix=' ')
- try:
- links = list(list_archive_data(
- filter_patterns=filter_patterns,
- filter_type=filter_type,
- after=after,
- before=before,
- ))
- finally:
- timer.end()
-
- if not len(links):
- log_removal_finished(0, 0)
- raise SystemExit(1)
-
-
- log_list_finished(links)
- log_removal_started(links, yes=yes, delete=delete)
-
- timer = TimedProgress(360, prefix=' ')
- try:
- to_keep = []
- all_links = load_main_index(out_dir=OUTPUT_DIR)
- for link in all_links:
- should_remove = (
- (after is not None and float(link.timestamp) < after)
- or (before is not None and float(link.timestamp) > before)
- or link_matches_filter(link, filter_patterns, filter_type)
- )
- if not should_remove:
- to_keep.append(link)
- elif should_remove and delete:
- shutil.rmtree(link.link_dir)
- finally:
- timer.end()
-
- write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
- log_removal_finished(len(all_links), len(to_keep))
-
- return to_keep
-
-
-
-def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """indexed links without checking archive status or data directory validity"""
- return {
- link.link_dir: link
- for link in links
- }
-
-def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """indexed links that are archived with a valid data directory"""
- return {
- link.link_dir: link
- for link in filter(is_archived, links)
- }
-
-def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """indexed links that are unarchived with no data directory or an empty data directory"""
- return {
- link.link_dir: link
- for link in filter(is_unarchived, links)
- }
-
-def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """dirs that are expected to exist based on the main index"""
- all_folders = {}
-
- for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
- if entry.is_dir(follow_symlinks=True):
- link = None
- try:
- link = parse_json_link_details(entry.path)
- except Exception:
- pass
-
- all_folders[entry.path] = link
-
- return all_folders
-
-def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """dirs with a valid index matched to the main index and archived content"""
- return {
- link.link_dir: link
- for link in filter(is_valid, links)
- }
-
-def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
- duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
- orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
- corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
- unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
- return {**duplicate, **orphaned, **corrupted, **unrecognized}
-
-
-def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """dirs that conflict with other directories that have the same link URL or timestamp"""
- links = list(links)
- by_url = {link.url: 0 for link in links}
- by_timestamp = {link.timestamp: 0 for link in links}
-
- duplicate_folders = {}
-
- indexed_folders = {link.link_dir for link in links}
- data_folders = (
- entry.path
- for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
- if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
- )
-
- for path in chain(sorted(indexed_folders), sorted(data_folders)):
- link = None
- try:
- link = parse_json_link_details(path)
- except Exception:
- pass
-
- if link:
- # link folder has same timestamp as different link folder
- by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
- if by_timestamp[link.timestamp] > 1:
- duplicate_folders[path] = link
-
- # link folder has same url as different link folder
- by_url[link.url] = by_url.get(link.url, 0) + 1
- if by_url[link.url] > 1:
- duplicate_folders[path] = link
-
- return duplicate_folders
-
-def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """dirs that contain a valid index but aren't listed in the main index"""
- links = list(links)
- indexed_folders = {link.link_dir: link for link in links}
- orphaned_folders = {}
-
- for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
- if entry.is_dir(follow_symlinks=True):
- index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
- link = None
- try:
- link = parse_json_link_details(entry.path)
- except Exception:
- pass
-
- if index_exists and entry.path not in indexed_folders:
- # folder is a valid link data dir with index details, but it's not in the main index
- orphaned_folders[entry.path] = link
-
- return orphaned_folders
-
-def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """dirs that don't contain a valid index and aren't listed in the main index"""
- return {
- link.link_dir: link
- for link in filter(is_corrupt, links)
- }
-
-def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- """dirs that don't contain recognizable archive data and aren't listed in the main index"""
- by_timestamp = {link.timestamp: 0 for link in links}
- unrecognized_folders: Dict[str, Optional[Link]] = {}
-
- for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
- if entry.is_dir(follow_symlinks=True):
- index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
- link = None
- try:
- link = parse_json_link_details(entry.path)
- except Exception:
- pass
-
- if index_exists and link is None:
- # index exists but it's corrupted or unparseable
- unrecognized_folders[entry.path] = link
-
- elif not index_exists:
- # link details index doesn't exist and the folder isn't in the main index
- timestamp = entry.path.rsplit('/', 1)[-1]
- if timestamp not in by_timestamp:
- unrecognized_folders[entry.path] = link
-
- return unrecognized_folders
-
-
-def is_valid(link: Link) -> bool:
- dir_exists = os.path.exists(link.link_dir)
- index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
- if not dir_exists:
- # unarchived links are not included in the valid list
- return False
- if dir_exists and not index_exists:
- return False
- if dir_exists and index_exists:
- try:
- parsed_link = parse_json_link_details(link.link_dir)
- return link.url == parsed_link.url
- except Exception:
- pass
- return False
-
-def is_corrupt(link: Link) -> bool:
- if not os.path.exists(link.link_dir):
- # unarchived links are not considered corrupt
- return False
-
- if is_valid(link):
- return False
-
- return True
-
-def is_archived(link: Link) -> bool:
- return is_valid(link) and link.is_archived
-
-def is_unarchived(link: Link) -> bool:
- if not os.path.exists(link.link_dir):
- return True
- return not link.is_archived
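
The get_duplicate_folders helper above flags a data directory as a duplicate once a second folder claims the same URL or the same timestamp, by keeping simple occurrence counters. A condensed sketch of that counting idea (the NamedTuple below is a stand-in for the real Link schema, not ArchiveBox code):

    from typing import Dict, List, NamedTuple

    class Link(NamedTuple):              # stand-in for the real Link schema
        url: str
        timestamp: str
        link_dir: str

    def find_duplicate_folders(links: List[Link]) -> Dict[str, Link]:
        by_url: Dict[str, int] = {}
        by_timestamp: Dict[str, int] = {}
        duplicates: Dict[str, Link] = {}
        for link in links:
            # a folder is a duplicate if an earlier folder already claimed its url or timestamp
            by_url[link.url] = by_url.get(link.url, 0) + 1
            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
            if by_url[link.url] > 1 or by_timestamp[link.timestamp] > 1:
                duplicates[link.link_dir] = link
        return duplicates

    links = [
        Link('https://example.com', '1544212312', 'archive/1544212312'),
        Link('https://example.com', '1544212999', 'archive/1544212999'),  # same url, different dir
    ]
    assert find_duplicate_folders(links) == {'archive/1544212999': links[1]}
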
diff --git a/archivebox/legacy/mypy_django.ini b/archivebox/legacy/mypy_django.ini
deleted file mode 100644
index 306e567c..00000000
--- a/archivebox/legacy/mypy_django.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-[mypy_django_plugin]
-
-# specify settings module to use for django.conf.settings, this setting
-# could also be specified with DJANGO_SETTINGS_MODULE environment variable
-# (it also takes priority over config file)
-django_settings = core.settings
-
-# if True, all unknown settings in django.conf.settings will fallback to Any,
-# specify it if your settings are loaded dynamically to avoid false positives
-ignore_missing_settings = True
diff --git a/archivebox/legacy/parse.py b/archivebox/legacy/parse.py
deleted file mode 100644
index 49ffa7fd..00000000
--- a/archivebox/legacy/parse.py
+++ /dev/null
@@ -1,331 +0,0 @@
-"""
-Everything related to parsing links from input sources.
-
-For a list of supported services, see the README.md.
-For examples of supported import formats see tests/.
-
-Link: {
- 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
- 'timestamp': '1544212312.4234',
- 'title': 'Example.com Page Title',
- 'tags': 'abc,def',
- 'sources': [
- 'output/sources/ril_export.html',
- 'output/sources/getpocket.com-1523422111.txt',
- 'output/sources/stdin-234234112312.txt'
- ]
-}
-"""
-
-import re
-import json
-
-from typing import Tuple, List, IO, Iterable
-from datetime import datetime
-import xml.etree.ElementTree as etree
-
-from .config import TIMEOUT
-from .util import (
- htmldecode,
- str_between,
- URL_REGEX,
- check_url_parsing_invariants,
- TimedProgress,
- Link,
- enforce_types,
-)
-
-
-@enforce_types
-def parse_links(source_file: str) -> Tuple[List[Link], str]:
- """parse a list of URLs with their metadata from an
- RSS feed, bookmarks export, or text file
- """
-
- check_url_parsing_invariants()
- PARSERS = (
- # Specialized parsers
- ('Pocket HTML', parse_pocket_html_export),
- ('Pinboard RSS', parse_pinboard_rss_export),
- ('Shaarli RSS', parse_shaarli_rss_export),
- ('Medium RSS', parse_medium_rss_export),
-
- # General parsers
- ('Netscape HTML', parse_netscape_html_export),
- ('Generic RSS', parse_rss_export),
- ('Generic JSON', parse_json_export),
-
- # Fallback parser
- ('Plain Text', parse_plain_text_export),
- )
- timer = TimedProgress(TIMEOUT * 4)
- with open(source_file, 'r', encoding='utf-8') as file:
- for parser_name, parser_func in PARSERS:
- try:
- links = list(parser_func(file))
- if links:
- timer.end()
- return links, parser_name
- except Exception as err: # noqa
- # Parsers are tried one by one down the list, and the first one
- # that succeeds is used. To see why a certain parser was not used
- # due to error or format incompatibility, uncomment this line:
- # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
- pass
-
- timer.end()
- return [], 'Failed to parse'
-
-
-### Import Parser Functions
-
-@enforce_types
-def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
- """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
-
- html_file.seek(0)
- pattern = re.compile("^\\s*(.+)", re.UNICODE)
- for line in html_file:
- # example line
- # example title
- match = pattern.search(line)
- if match:
- url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
- time = datetime.fromtimestamp(float(match.group(2)))
- tags = match.group(3)
- title = match.group(4).replace(' β Readability', '').replace('http://www.readability.com/read?url=', '')
-
- yield Link(
- url=htmldecode(url),
- timestamp=str(time.timestamp()),
- title=htmldecode(title) or None,
- tags=tags or '',
- sources=[html_file.name],
- )
-
-
-@enforce_types
-def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
- """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
-
- json_file.seek(0)
- links = json.load(json_file)
- json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
-
- for link in links:
- # example line
- # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
- if link:
- # Parse URL
- url = link.get('href') or link.get('url') or link.get('URL')
- if not url:
- raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
-
- # Parse the timestamp
- ts_str = str(datetime.now().timestamp())
- if link.get('timestamp'):
- # chrome/ff histories use a very precise timestamp
- ts_str = str(link['timestamp'] / 10000000)
- elif link.get('time'):
- ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
- elif link.get('created_at'):
- ts_str = str(json_date(link['created_at']).timestamp())
- elif link.get('created'):
- ts_str = str(json_date(link['created']).timestamp())
- elif link.get('date'):
- ts_str = str(json_date(link['date']).timestamp())
- elif link.get('bookmarked'):
- ts_str = str(json_date(link['bookmarked']).timestamp())
- elif link.get('saved'):
- ts_str = str(json_date(link['saved']).timestamp())
-
- # Parse the title
- title = None
- if link.get('title'):
- title = link['title'].strip()
- elif link.get('description'):
- title = link['description'].replace(' — Readability', '').strip()
- elif link.get('name'):
- title = link['name'].strip()
-
- yield Link(
- url=htmldecode(url),
- timestamp=ts_str,
- title=htmldecode(title) or None,
- tags=htmldecode(link.get('tags')) or '',
- sources=[json_file.name],
- )
-
-
-@enforce_types
-def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
- """Parse RSS XML-format files into links"""
-
- rss_file.seek(0)
- items = rss_file.read().split('<item>')
- items = items[1:] if items else []
- for item in items:
- # example item:
- # <item>
- # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
- # <category>Unread</category>
- # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
- # <guid isPermaLink="false">https://blog.sessionstack.com/how-javascript-works-inside</guid>
- # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
- # </item>
-
- trailing_removed = item.split('</item>', 1)[0]
- leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
- rows = leading_removed.split('\n')
-
- def get_row(key):
- return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
-
- url = str_between(get_row('link'), '<link>', '</link>')
- ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
- time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
- title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
-
- yield Link(
- url=htmldecode(url),
- timestamp=str(time.timestamp()),
- title=htmldecode(title) or None,
- tags=None,
- sources=[rss_file.name],
- )
-
-
-@enforce_types
-def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
- """Parse Shaarli-specific RSS XML-format files into links"""
-
- rss_file.seek(0)
- entries = rss_file.read().split('<entry>')[1:]
- for entry in entries:
- # example entry:
- # <entry>
- # <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
- # <link href="https://demo.shaarli.org/?cEV4vw" />
- # <id>https://demo.shaarli.org/?cEV4vw</id>
- # <published>2019-01-30T06:06:01+00:00</published>
- # <updated>2019-01-30T06:06:01+00:00</updated>
- # <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
- # </entry>
-
- trailing_removed = entry.split('</entry>', 1)[0]
- leading_removed = trailing_removed.strip()
- rows = leading_removed.split('\n')
-
- def get_row(key):
- return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
-
- title = str_between(get_row('title'), '<title>', '</title>').strip()
- url = str_between(get_row('link'), '<link href="', '" />')
- ts_str = str_between(get_row('published'), '<published>', '</published>')
- time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-
- yield Link(
- url=htmldecode(url),
- timestamp=str(time.timestamp()),
- title=htmldecode(title) or None,
- tags=None,
- sources=[rss_file.name],
- )
-
-
-@enforce_types
-def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
- """Parse netscape-format bookmarks export files (produced by all browsers)"""
-
- html_file.seek(0)
- pattern = re.compile("]*>(.+)", re.UNICODE | re.IGNORECASE)
- for line in html_file:
- # example line
- # example bookmark title
-
- match = pattern.search(line)
- if match:
- url = match.group(1)
- time = datetime.fromtimestamp(float(match.group(2)))
- title = match.group(3).strip()
-
- yield Link(
- url=htmldecode(url),
- timestamp=str(time.timestamp()),
- title=htmldecode(title) or None,
- tags=None,
- sources=[html_file.name],
- )
-
-
-@enforce_types
-def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
- """Parse Pinboard RSS feed files into links"""
-
- rss_file.seek(0)
- root = etree.parse(rss_file).getroot()
- items = root.findall("{http://purl.org/rss/1.0/}item")
- for item in items:
- find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore
-
- url = find("{http://purl.org/rss/1.0/}link")
- tags = find("{http://purl.org/dc/elements/1.1/}subject")
- title = find("{http://purl.org/rss/1.0/}title")
- ts_str = find("{http://purl.org/dc/elements/1.1/}date")
-
- # Pinboard includes a colon in its date stamp timezone offsets, which
- # Python can't parse. Remove it:
- if ts_str and ts_str[-3:-2] == ":":
- ts_str = ts_str[:-3]+ts_str[-2:]
-
- if ts_str:
- time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
- else:
- time = datetime.now()
-
- yield Link(
- url=htmldecode(url),
- timestamp=str(time.timestamp()),
- title=htmldecode(title) or None,
- tags=htmldecode(tags) or None,
- sources=[rss_file.name],
- )
-
-
-@enforce_types
-def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
- """Parse Medium RSS feed files into links"""
-
- rss_file.seek(0)
- root = etree.parse(rss_file).getroot()
- items = root.find("channel").findall("item") # type: ignore
- for item in items:
- url = item.find("link").text # type: ignore
- title = item.find("title").text.strip() # type: ignore
- ts_str = item.find("pubDate").text # type: ignore
- time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
-
- yield Link(
- url=htmldecode(url),
- timestamp=str(time.timestamp()),
- title=htmldecode(title) or None,
- tags=None,
- sources=[rss_file.name],
- )
-
-
-@enforce_types
-def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
- """Parse raw links from each line in a text file"""
-
- text_file.seek(0)
- for line in text_file.readlines():
- urls = re.findall(URL_REGEX, line) if line.strip() else ()
- for url in urls: # type: ignore
- yield Link(
- url=htmldecode(url),
- timestamp=str(datetime.now().timestamp()),
- title=None,
- tags=None,
- sources=[text_file.name],
- )
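
The parse_links pipeline above tries each format-specific parser in order and accepts the first one that yields any links, keeping plain-text URL extraction as the last resort. A small self-contained sketch of that fallback chain (the two toy parsers and the regex below are illustrative only, not the ArchiveBox implementations):

    import json
    import re
    from typing import Callable, Iterable, List, Tuple

    URL_REGEX = re.compile(r'https?://[^\s<>"\']+')

    def parse_json_lines(text: str) -> Iterable[str]:
        # specialized parser: expects one JSON object per line with a "url" key
        for line in text.splitlines():
            yield json.loads(line)['url']   # raises on anything that isn't JSON

    def parse_plain_text(text: str) -> Iterable[str]:
        # generic fallback: scrape anything that looks like a URL
        for line in text.splitlines():
            yield from URL_REGEX.findall(line)

    PARSERS: Tuple[Tuple[str, Callable[[str], Iterable[str]]], ...] = (
        ('JSON lines', parse_json_lines),   # specialized parsers are tried first
        ('Plain text', parse_plain_text),   # the generic fallback is tried last
    )

    def parse_links(text: str) -> Tuple[List[str], str]:
        for name, parser in PARSERS:
            try:
                links = list(parser(text))
                if links:
                    return links, name
            except Exception:
                pass                        # on failure, fall through to the next parser
        return [], 'Failed to parse'

    print(parse_links('check out https://example.com and https://example.org'))
    # (['https://example.com', 'https://example.org'], 'Plain text')
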
diff --git a/archivebox/legacy/purge.py b/archivebox/legacy/purge.py
deleted file mode 100755
index b36083f0..00000000
--- a/archivebox/legacy/purge.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-from argparse import ArgumentParser
-from os.path import exists, join
-from shutil import rmtree
-from typing import List
-
-from .config import ARCHIVE_DIR, OUTPUT_DIR
-from .index import (
- parse_json_links_index,
- write_html_links_index,
- write_json_links_index,
-)
-
-
-def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
- if not exists(join(OUTPUT_DIR, 'index.json')):
- exit('index.json is missing; nothing to do')
-
- compiled = [re.compile(r) for r in regexes]
- links = parse_json_links_index(OUTPUT_DIR)
- filtered = []
- remaining = []
-
- for link in links:
- url = link.url
- for r in compiled:
- if r.search(url):
- filtered.append((link, r))
- break
- else:
- remaining.append(link)
-
- if not filtered:
- exit('Search did not match any entries.')
-
- print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
-
- for link, regex in filtered:
- url = link.url
- print(' {url} via {regex}'.format(url=url, regex=regex.pattern))
-
- if not proceed:
- answer = input('Remove {} entries from index? [y/n] '.format(
- len(filtered)))
- proceed = answer.strip().lower() in ('y', 'yes')
-
- if not proceed:
- exit('Aborted')
-
- write_json_links_index(OUTPUT_DIR, remaining)
- write_html_links_index(OUTPUT_DIR, remaining)
-
- if delete:
- for link, _ in filtered:
- data_dir = join(ARCHIVE_DIR, link['timestamp'])
- if exists(data_dir):
- rmtree(data_dir)
-
-
-if __name__ == '__main__':
- p = ArgumentParser('Index purging tool')
- p.add_argument(
- '--regex',
- '-r',
- action='append',
- help='Regular expression matching URLs to purge',
- )
- p.add_argument(
- '--delete',
- '-d',
- action='store_true',
- default=False,
- help='Delete webpage files from archive',
- )
- p.add_argument(
- '--yes',
- '-y',
- action='store_true',
- default=False,
- help='Do not prompt for confirmation',
- )
-
- args = p.parse_args()
- if args.regex:
- cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
- else:
- p.print_help()
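
The purge helper above partitions the index by matching each link URL against the supplied regexes: anything that matches is dropped from the index (and optionally deleted from disk), everything else is written back. A minimal sketch of that partitioning step on bare URL strings (the sample URLs are made up for illustration):

    import re
    from typing import List, Tuple

    def partition_by_regex(urls: List[str], regexes: List[str]) -> Tuple[List[str], List[str]]:
        compiled = [re.compile(r) for r in regexes]
        filtered, remaining = [], []
        for url in urls:
            # the first matching pattern claims the url; otherwise it is kept
            if any(r.search(url) for r in compiled):
                filtered.append(url)
            else:
                remaining.append(url)
        return filtered, remaining

    filtered, remaining = partition_by_regex(
        ['https://example.com/a', 'https://tracker.test/pixel.gif'],
        [r'tracker\.'],
    )
    assert filtered == ['https://tracker.test/pixel.gif']
    assert remaining == ['https://example.com/a']
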
diff --git a/archivebox/legacy/storage/__init__.py b/archivebox/legacy/storage/__init__.py
deleted file mode 100644
index 40c7f113..00000000
--- a/archivebox/legacy/storage/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__package__ = 'archivebox.legacy.storage'
diff --git a/archivebox/main.py b/archivebox/main.py
new file mode 100644
index 00000000..501f6efd
--- /dev/null
+++ b/archivebox/main.py
@@ -0,0 +1,1086 @@
+__package__ = 'archivebox'
+
+import re
+import os
+import sys
+import shutil
+
+from typing import Dict, List, Optional, Set, Tuple, Iterable, IO
+
+from crontab import CronTab, CronSlices
+
+from .cli import (
+ list_subcommands,
+ run_subcommand,
+ display_first,
+ meta_cmds,
+ main_cmds,
+ archive_cmds,
+)
+from .index.schema import Link
+from .util import (
+ enforce_types,
+ TimedProgress,
+ get_dir_size,
+ human_readable_size,
+ save_stdin_to_sources,
+ save_file_to_sources,
+ links_to_csv,
+ to_json,
+ folders_to_str,
+)
+from .index import (
+ links_after_timestamp,
+ load_main_index,
+ import_new_links,
+ write_main_index,
+ link_matches_filter,
+ get_indexed_folders,
+ get_archived_folders,
+ get_unarchived_folders,
+ get_present_folders,
+ get_valid_folders,
+ get_invalid_folders,
+ get_duplicate_folders,
+ get_orphaned_folders,
+ get_corrupted_folders,
+ get_unrecognized_folders,
+ fix_invalid_folder_locations,
+)
+from .index.json import (
+ parse_json_main_index,
+ parse_json_links_details,
+)
+from .index.sql import parse_sql_main_index, get_admins, apply_migrations
+from .index.html import parse_html_main_index
+from .extractors import archive_link
+from .config import (
+ stderr,
+ ConfigDict,
+ ANSI,
+ IS_TTY,
+ USER,
+ ARCHIVEBOX_BINARY,
+ ONLY_NEW,
+ OUTPUT_DIR,
+ SOURCES_DIR,
+ ARCHIVE_DIR,
+ LOGS_DIR,
+ CONFIG_FILE,
+ ARCHIVE_DIR_NAME,
+ SOURCES_DIR_NAME,
+ LOGS_DIR_NAME,
+ STATIC_DIR_NAME,
+ JSON_INDEX_FILENAME,
+ HTML_INDEX_FILENAME,
+ SQL_INDEX_FILENAME,
+ ROBOTS_TXT_FILENAME,
+ FAVICON_FILENAME,
+ check_dependencies,
+ check_data_folder,
+ write_config_file,
+ setup_django,
+ VERSION,
+ CODE_LOCATIONS,
+ EXTERNAL_LOCATIONS,
+ DATA_LOCATIONS,
+ DEPENDENCIES,
+ load_all_config,
+ CONFIG,
+ USER_CONFIG,
+ get_real_name,
+)
+from .cli.logging import (
+ log_archiving_started,
+ log_archiving_paused,
+ log_archiving_finished,
+ log_removal_started,
+ log_removal_finished,
+ log_list_started,
+ log_list_finished,
+)
+
+
+ALLOWED_IN_OUTPUT_DIR = {
+ '.DS_Store',
+ '.venv',
+ 'venv',
+ 'virtualenv',
+ '.virtualenv',
+ ARCHIVE_DIR_NAME,
+ SOURCES_DIR_NAME,
+ LOGS_DIR_NAME,
+ STATIC_DIR_NAME,
+ SQL_INDEX_FILENAME,
+ JSON_INDEX_FILENAME,
+ HTML_INDEX_FILENAME,
+ ROBOTS_TXT_FILENAME,
+ FAVICON_FILENAME,
+}
+
+def help(out_dir: str=OUTPUT_DIR) -> None:
+ all_subcommands = list_subcommands()
+ COMMANDS_HELP_TEXT = '\n '.join(
+ f'{cmd.ljust(20)} {summary}'
+ for cmd, summary in all_subcommands.items()
+ if cmd in meta_cmds
+ ) + '\n\n ' + '\n '.join(
+ f'{cmd.ljust(20)} {summary}'
+ for cmd, summary in all_subcommands.items()
+ if cmd in main_cmds
+ ) + '\n\n ' + '\n '.join(
+ f'{cmd.ljust(20)} {summary}'
+ for cmd, summary in all_subcommands.items()
+ if cmd in archive_cmds
+ ) + '\n\n ' + '\n '.join(
+ f'{cmd.ljust(20)} {summary}'
+ for cmd, summary in all_subcommands.items()
+ if cmd not in display_first
+ )
+
+
+ if os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+ print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}
+
+{lightred}Active data directory:{reset}
+ {}
+
+{lightred}Usage:{reset}
+ archivebox [command] [--help] [--version] [...args]
+
+{lightred}Commands:{reset}
+ {}
+
+{lightred}Example Use:{reset}
+ mkdir my-archive; cd my-archive/
+ archivebox init
+ archivebox info
+
+ archivebox add https://example.com/some/page
+ archivebox add --depth=1 ~/Downloads/bookmarks_export.html
+
+ archivebox list --sort=timestamp --csv=timestamp,url,is_archived
+ archivebox schedule --every=week https://example.com/some/feed.rss
+ archivebox update --resume=15109948213.123
+
+{lightred}Documentation:{reset}
+ https://github.com/pirate/ArchiveBox/wiki
+'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
+
+ else:
+ print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI))
+ print()
+ print('To import an existing archive (from a previous version of ArchiveBox):')
+ print(' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
+ print(' 2. archivebox init')
+ print()
+ print('To start a new archive:')
+ print(' 1. Create an empty directory, then cd into it and run:')
+ print(' 2. archivebox init')
+ print()
+ print('For more information, see the documentation here:')
+ print(' https://github.com/pirate/ArchiveBox/wiki')
+
+
+def version(quiet: bool=False, out_dir: str=OUTPUT_DIR) -> None:
+ if quiet:
+ print(VERSION)
+ else:
+ print('ArchiveBox v{}'.format(VERSION))
+ print()
+
+ print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
+ for name, dependency in DEPENDENCIES.items():
+ print_dependency_version(name, dependency)
+
+ print()
+ print('{white}[i] Code locations:{reset}'.format(**ANSI))
+ for name, folder in CODE_LOCATIONS.items():
+ print_folder_status(name, folder)
+
+ print()
+ print('{white}[i] External locations:{reset}'.format(**ANSI))
+ for name, folder in EXTERNAL_LOCATIONS.items():
+ print_folder_status(name, folder)
+
+ print()
+ print('{white}[i] Data locations:{reset}'.format(**ANSI))
+ for name, folder in DATA_LOCATIONS.items():
+ print_folder_status(name, folder)
+
+ print()
+ check_dependencies()
+
+
+def run(subcommand: str, subcommand_args: Optional[List[str]], stdin: Optional[IO]=None, out_dir: str=OUTPUT_DIR) -> None:
+ run_subcommand(
+ subcommand=subcommand,
+ subcommand_args=subcommand_args,
+ stdin=stdin,
+ out_dir=out_dir,
+ )
+
+
+def init(out_dir: str=OUTPUT_DIR) -> None:
+ os.makedirs(out_dir, exist_ok=True)
+
+ is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
+ existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))
+
+ if is_empty and not existing_index:
+ print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
+ print(f' {out_dir}')
+ print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ elif existing_index:
+ print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
+ print(f' {out_dir}')
+ print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ else:
+ stderr(
+ ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
+ " You must run init in a completely empty directory, or an existing data folder.\n\n"
+ " {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
+ " then run and run 'archivebox init' to pick up where you left off.\n\n"
+ " (Always make sure your data folder is backed up first before updating ArchiveBox)"
+ ).format(out_dir, **ANSI)
+ )
+ raise SystemExit(1)
+
+ if existing_index:
+ print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
+ else:
+ print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
+
+ os.makedirs(SOURCES_DIR, exist_ok=True)
+ print(f' √ {SOURCES_DIR}')
+
+ os.makedirs(ARCHIVE_DIR, exist_ok=True)
+ print(f' √ {ARCHIVE_DIR}')
+
+ os.makedirs(LOGS_DIR, exist_ok=True)
+ print(f' √ {LOGS_DIR}')
+
+ write_config_file({}, out_dir=out_dir)
+ print(f' √ {CONFIG_FILE}')
+
+ if os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+ print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
+ else:
+ print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
+
+ setup_django(out_dir, check_db=False)
+ from django.conf import settings
+ assert settings.DATABASE_FILE == os.path.join(out_dir, SQL_INDEX_FILENAME)
+ print(f' √ {settings.DATABASE_FILE}')
+ print()
+ for migration_line in apply_migrations(out_dir):
+ print(f' {migration_line}')
+
+
+ assert os.path.exists(settings.DATABASE_FILE)
+
+ # from django.contrib.auth.models import User
+ # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+ # print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
+ # call_command("createsuperuser", interactive=True)
+
+ print()
+ print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
+
+ all_links: Dict[str, Link] = {}
+ if existing_index:
+ all_links = {
+ link.url: link
+ for link in load_main_index(out_dir=out_dir, warn=False)
+ }
+ print(' √ Loaded {} links from existing main index.'.format(len(all_links)))
+
+ # Links in data folders that dont match their timestamp
+ fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+ if fixed:
+ print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+ if cant_fix:
+ print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+
+ # Links in JSON index but not in main index
+ orphaned_json_links = {
+ link.url: link
+ for link in parse_json_main_index(out_dir)
+ if link.url not in all_links
+ }
+ if orphaned_json_links:
+ all_links.update(orphaned_json_links)
+ print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+
+ # Links in SQL index but not in main index
+ orphaned_sql_links = {
+ link.url: link
+ for link in parse_sql_main_index(out_dir)
+ if link.url not in all_links
+ }
+ if orphaned_sql_links:
+ all_links.update(orphaned_sql_links)
+ print(' {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
+
+ # Links in data dir indexes but not in main index
+ orphaned_data_dir_links = {
+ link.url: link
+ for link in parse_json_links_details(out_dir)
+ if link.url not in all_links
+ }
+ if orphaned_data_dir_links:
+ all_links.update(orphaned_data_dir_links)
+ print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+
+ # Links in invalid/duplicate data dirs
+ invalid_folders = {
+ folder: link
+ for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items()
+ }
+ if invalid_folders:
+ print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+ print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
+ print()
+ print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+ print(' archivebox info')
+ print(' archivebox list --status=invalid')
+
+
+ write_main_index(list(all_links.values()), out_dir=out_dir)
+
+ print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ if existing_index:
+ print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
+ else:
+ print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
+ print()
+ print(' To view your archive index, open:')
+ print(' {}'.format(os.path.join(out_dir, HTML_INDEX_FILENAME)))
+ print()
+ print(' To add new links, you can run:')
+ print(" archivebox add 'https://example.com'")
+ print()
+ print(' For more usage and examples, run:')
+ print(' archivebox help')
+
+
+def info(out_dir: str=OUTPUT_DIR) -> None:
+ check_data_folder(out_dir=out_dir)
+
+ print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
+ print(f' {out_dir}/*')
+ num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
+ size = human_readable_size(num_bytes)
+ print(f' Size: {size} across {num_files} files')
+ print()
+
+ links = list(load_main_index(out_dir=out_dir))
+ num_json_links = len(links)
+ num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir))
+ num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
+ num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
+ users = get_admins().values_list('username', flat=True)
+ print(f' > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
+ print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+ print(f' > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
+ print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+
+ print(f' > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+
+ if num_html_links != len(links) or num_sql_links != len(links):
+ print()
+ print(' {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
+ print(' archivebox init')
+
+ if not users:
+ print()
+ print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
+ print(' archivebox manage createsuperuser')
+
+ print()
+ print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
+ print(f' {ARCHIVE_DIR}/*')
+
+ num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
+ size = human_readable_size(num_bytes)
+ print(f' Size: {size} across {num_files} files in {num_dirs} directories')
+ print()
+
+ num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
+ num_archived = len(get_archived_folders(links, out_dir=out_dir))
+ num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
+ print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
+ print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
+ print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
+
+ num_present = len(get_present_folders(links, out_dir=out_dir))
+ num_valid = len(get_valid_folders(links, out_dir=out_dir))
+ print()
+ print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
+ print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
+
+ duplicate = get_duplicate_folders(links, out_dir=out_dir)
+ orphaned = get_orphaned_folders(links, out_dir=out_dir)
+ corrupted = get_corrupted_folders(links, out_dir=out_dir)
+ unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
+ num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
+ print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
+ print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
+ print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
+ print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
+ print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
+
+ if num_indexed:
+ print()
+ print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
+ print(' archivebox list --status= (e.g. indexed, corrupted, archived, etc.)')
+
+ if orphaned:
+ print()
+ print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
+ print(' archivebox init')
+
+ if num_invalid:
+ print()
+ print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
+ print(' archivebox init')
+
+ print()
+
+
+@enforce_types
+def add(import_str: Optional[str]=None,
+ import_path: Optional[str]=None,
+ update_all: bool=not ONLY_NEW,
+ index_only: bool=False,
+ out_dir: str=OUTPUT_DIR) -> List[Link]:
+ """The main ArchiveBox entrancepoint. Everything starts here."""
+
+ check_data_folder(out_dir=out_dir)
+
+ if import_str and import_path:
+ stderr(
+ '[X] You should pass either an import path as an argument, '
+ 'or pass a list of links via stdin, but not both.\n',
+ color='red',
+ )
+ raise SystemExit(2)
+ elif import_str:
+ import_path = save_stdin_to_sources(import_str, out_dir=out_dir)
+ else:
+ import_path = save_file_to_sources(import_path, out_dir=out_dir)
+
+ check_dependencies()
+
+ # Step 1: Load list of links from the existing index
+ # merge in and dedupe new links from import_path
+ all_links: List[Link] = []
+ new_links: List[Link] = []
+ all_links = load_main_index(out_dir=out_dir)
+ if import_path:
+ all_links, new_links = import_new_links(all_links, import_path, out_dir=out_dir)
+
+ # Step 2: Write updated index with deduped old and new links back to disk
+ write_main_index(links=all_links, out_dir=out_dir)
+
+ if index_only:
+ return all_links
+
+ # Step 3: Run the archive methods for each link
+ links = all_links if update_all else new_links
+ log_archiving_started(len(links))
+ idx: int = 0
+ link: Link = None # type: ignore
+ try:
+ for idx, link in enumerate(links):
+ archive_link(link, out_dir=link.link_dir)
+
+ except KeyboardInterrupt:
+ log_archiving_paused(len(links), idx, link.timestamp if link else '0')
+ raise SystemExit(0)
+
+ except:
+ print()
+ raise
+
+ log_archiving_finished(len(links))
+
+ # Step 4: Re-write links index with updated titles, icons, and resources
+ all_links = load_main_index(out_dir=out_dir)
+ write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+ return all_links
+
+@enforce_types
+def remove(filter_str: Optional[str]=None,
+ filter_patterns: Optional[List[str]]=None,
+ filter_type: str='exact',
+ after: Optional[float]=None,
+ before: Optional[float]=None,
+ yes: bool=False,
+ delete: bool=False,
+ out_dir: str=OUTPUT_DIR) -> List[Link]:
+
+ check_data_folder(out_dir=out_dir)
+
+ if filter_str and filter_patterns:
+ stderr(
+ '[X] You should pass either a pattern as an argument, '
+ 'or pass a list of patterns via stdin, but not both.\n',
+ color='red',
+ )
+ raise SystemExit(2)
+ elif not (filter_str or filter_patterns):
+ stderr(
+ '[X] You should pass either a pattern as an argument, '
+ 'or pass a list of patterns via stdin.',
+ color='red',
+ )
+ stderr()
+ stderr(' {lightred}Hint:{reset} To remove all urls you can run:'.format(**ANSI))
+ stderr(" archivebox remove --filter-type=regex '.*'")
+ stderr()
+ raise SystemExit(2)
+ elif filter_str:
+ filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
+
+ log_list_started(filter_patterns, filter_type)
+ timer = TimedProgress(360, prefix=' ')
+ try:
+ links = list(list_links(
+ filter_patterns=filter_patterns,
+ filter_type=filter_type,
+ after=after,
+ before=before,
+ ))
+ finally:
+ timer.end()
+
+ if not len(links):
+ log_removal_finished(0, 0)
+ raise SystemExit(1)
+
+
+ log_list_finished(links)
+ log_removal_started(links, yes=yes, delete=delete)
+
+ timer = TimedProgress(360, prefix=' ')
+ try:
+ to_keep = []
+ all_links = load_main_index(out_dir=out_dir)
+ for link in all_links:
+ should_remove = (
+ (after is not None and float(link.timestamp) < after)
+ or (before is not None and float(link.timestamp) > before)
+ or link_matches_filter(link, filter_patterns, filter_type)
+ )
+ if not should_remove:
+ to_keep.append(link)
+ elif should_remove and delete:
+ shutil.rmtree(link.link_dir, ignore_errors=True)
+ finally:
+ timer.end()
+
+ write_main_index(links=to_keep, out_dir=out_dir, finished=True)
+ log_removal_finished(len(all_links), len(to_keep))
+
+ return to_keep
+
+@enforce_types
+def update(resume: Optional[float]=None,
+ only_new: bool=not ONLY_NEW,
+ index_only: bool=False,
+ overwrite: bool=False,
+ filter_patterns_str: Optional[str]=None,
+ filter_patterns: Optional[List[str]]=None,
+ filter_type: Optional[str]=None,
+ status: Optional[str]=None,
+ after: Optional[str]=None,
+ before: Optional[str]=None,
+ out_dir: str=OUTPUT_DIR) -> List[Link]:
+ """The main ArchiveBox entrancepoint. Everything starts here."""
+
+ check_dependencies()
+ check_data_folder(out_dir=out_dir)
+
+    # Step 1: Load list of links from the existing index
+    all_links: List[Link] = []
+    new_links: List[Link] = []
+ all_links = load_main_index(out_dir=out_dir)
+
+ # Step 2: Write updated index with deduped old and new links back to disk
+ write_main_index(links=list(all_links), out_dir=out_dir)
+
+ # Step 3: Filter for selected_links
+ matching_links = list_links(
+ filter_patterns=filter_patterns,
+ filter_type=filter_type,
+ before=before,
+ after=after,
+ )
+ matching_folders = list_folders(
+ links=list(matching_links),
+ status=status,
+ out_dir=out_dir,
+ )
+ all_links = [link for link in matching_folders.values() if link]
+
+ if index_only:
+ return all_links
+
+    # Step 4: Run the archive methods for each link
+ links = new_links if only_new else all_links
+ log_archiving_started(len(links), resume)
+ idx: int = 0
+ link: Link = None # type: ignore
+ try:
+ for idx, link in enumerate(links_after_timestamp(links, resume)):
+ archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
+
+ except KeyboardInterrupt:
+ log_archiving_paused(len(links), idx, link.timestamp if link else '0')
+ raise SystemExit(0)
+
+ except:
+ print()
+ raise
+
+ log_archiving_finished(len(links))
+
+    # Step 5: Re-write links index with updated titles, icons, and resources
+ all_links = load_main_index(out_dir=out_dir)
+ write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+ return all_links
+
+@enforce_types
+def list_all(filter_patterns_str: Optional[str]=None,
+ filter_patterns: Optional[List[str]]=None,
+ filter_type: str='exact',
+ status: Optional[str]=None,
+ after: Optional[float]=None,
+ before: Optional[float]=None,
+ sort: Optional[str]=None,
+ csv: Optional[str]=None,
+ json: Optional[str]=None,
+ out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
+
+ check_data_folder(out_dir=out_dir)
+
+ if filter_patterns and filter_patterns_str:
+ stderr(
+            '[X] You should either pass filter patterns as arguments '
+ 'or via stdin, but not both.\n',
+ color='red',
+ )
+ raise SystemExit(2)
+ elif filter_patterns_str:
+ filter_patterns = filter_patterns_str.split('\n')
+
+
+ links = list_links(
+ filter_patterns=filter_patterns,
+ filter_type=filter_type,
+ before=before,
+ after=after,
+ )
+
+ if sort:
+ links = sorted(links, key=lambda link: getattr(link, sort))
+
+ folders = list_folders(
+ links=list(links),
+ status=status,
+ out_dir=out_dir,
+ )
+
+ if csv:
+ print(links_to_csv(folders.values(), csv_cols=csv.split(','), header=True))
+ elif json:
+ print(to_json(folders.values(), indent=4, sort_keys=True))
+ else:
+ print(folders_to_str(folders))
+ raise SystemExit(not folders)
+
+
+@enforce_types
+def list_links(filter_patterns: Optional[List[str]]=None,
+ filter_type: str='exact',
+ after: Optional[float]=None,
+ before: Optional[float]=None,
+ out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
+
+ check_data_folder(out_dir=out_dir)
+
+ all_links = load_main_index(out_dir=out_dir)
+
+ for link in all_links:
+ if after is not None and float(link.timestamp) < after:
+ continue
+ if before is not None and float(link.timestamp) > before:
+ continue
+
+ if filter_patterns:
+ if link_matches_filter(link, filter_patterns, filter_type):
+ yield link
+ else:
+ yield link
+
+@enforce_types
+def list_folders(links: List[Link],
+                 status: Optional[str],
+                 out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+
+    check_data_folder(out_dir=out_dir)
+
+    if status is None or status == 'indexed':
+ return get_indexed_folders(links, out_dir=out_dir)
+ elif status == 'archived':
+ return get_archived_folders(links, out_dir=out_dir)
+ elif status == 'unarchived':
+ return get_unarchived_folders(links, out_dir=out_dir)
+
+ elif status == 'present':
+ return get_present_folders(links, out_dir=out_dir)
+ elif status == 'valid':
+ return get_valid_folders(links, out_dir=out_dir)
+ elif status == 'invalid':
+ return get_invalid_folders(links, out_dir=out_dir)
+
+ elif status == 'duplicate':
+ return get_duplicate_folders(links, out_dir=out_dir)
+ elif status == 'orphaned':
+ return get_orphaned_folders(links, out_dir=out_dir)
+ elif status == 'corrupted':
+ return get_corrupted_folders(links, out_dir=out_dir)
+ elif status == 'unrecognized':
+ return get_unrecognized_folders(links, out_dir=out_dir)
+
+ raise ValueError('Status not recognized.')
+
+
+def config(config_options_str: Optional[str]=None,
+ config_options: Optional[List[str]]=None,
+ get: bool=False,
+ set: bool=False,
+ reset: bool=False,
+ out_dir: str=OUTPUT_DIR) -> None:
+
+ check_data_folder(out_dir=out_dir)
+
+ if config_options and config_options_str:
+ stderr(
+            '[X] You should either pass config values as arguments '
+ 'or via stdin, but not both.\n',
+ color='red',
+ )
+ raise SystemExit(2)
+ elif config_options_str:
+        config_options = config_options_str.split('\n')
+
+ config_options = config_options or []
+
+ no_args = not (get or set or reset or config_options)
+
+ matching_config: ConfigDict = {}
+ if get or no_args:
+ if config_options:
+ config_options = [get_real_name(key) for key in config_options]
+ matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
+ failed_config = [key for key in config_options if key not in CONFIG]
+ if failed_config:
+ stderr()
+ stderr('[X] These options failed to get', color='red')
+                stderr('    {}'.format('\n    '.join(failed_config)))
+ raise SystemExit(1)
+ else:
+ matching_config = CONFIG
+
+ print(printable_config(matching_config))
+ raise SystemExit(not matching_config)
+ elif set:
+ new_config = {}
+ failed_options = []
+ for line in config_options:
+ if line.startswith('#') or not line.strip():
+ continue
+ if '=' not in line:
+ stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
+ stderr(f' {line}')
+ raise SystemExit(2)
+
+            raw_key, val = line.split('=', 1)
+ raw_key = raw_key.upper().strip()
+ key = get_real_name(raw_key)
+ if key != raw_key:
+ stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
+
+ if key in CONFIG:
+ new_config[key] = val.strip()
+ else:
+ failed_options.append(line)
+
+ if new_config:
+ before = CONFIG
+ matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
+ after = load_all_config()
+ print(printable_config(matching_config))
+
+ side_effect_changes: ConfigDict = {}
+ for key, val in after.items():
+ if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
+ side_effect_changes[key] = after[key]
+
+ if side_effect_changes:
+ stderr()
+ stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
+ print(' {}'.format(printable_config(side_effect_changes, prefix=' ')))
+ if failed_options:
+ stderr()
+ stderr('[X] These options failed to set:', color='red')
+ stderr(' {}'.format('\n '.join(failed_options)))
+ raise SystemExit(bool(failed_options))
+ elif reset:
+ stderr('[X] This command is not implemented yet.', color='red')
+ stderr(' Please manually remove the relevant lines from your config file:')
+ stderr(f' {CONFIG_FILE}')
+ raise SystemExit(2)
+
+ else:
+ stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
+ stderr(' archivebox config')
+ stderr(' archivebox config --get SOME_KEY')
+ stderr(' archivebox config --set SOME_KEY=SOME_VALUE')
+ raise SystemExit(2)
+
+
+CRON_COMMENT = 'archivebox_schedule'
+
+@enforce_types
+def schedule(add: bool=False,
+ show: bool=False,
+ clear: bool=False,
+ foreground: bool=False,
+ run_all: bool=False,
+ quiet: bool=False,
+ every: Optional[str]=None,
+ import_path: Optional[str]=None,
+ out_dir: str=OUTPUT_DIR):
+
+ check_data_folder(out_dir=out_dir)
+
+ os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)
+
+ cron = CronTab(user=True)
+ cron = dedupe_jobs(cron)
+
+ existing_jobs = list(cron.find_comment(CRON_COMMENT))
+ if foreground or run_all:
+ if import_path or (not existing_jobs):
+ stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
+ stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
+ raise SystemExit(1)
+ print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
+ if run_all:
+ try:
+ for job in existing_jobs:
+ sys.stdout.write(f' > {job.command}')
+ sys.stdout.flush()
+ job.run()
+                sys.stdout.write(f'\r √ {job.command}\n')
+ except KeyboardInterrupt:
+            print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+ raise SystemExit(1)
+ if foreground:
+ try:
+ for result in cron.run_scheduler():
+ print(result)
+ except KeyboardInterrupt:
+            print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+ raise SystemExit(1)
+
+ elif show:
+ if existing_jobs:
+ print('\n'.join(str(cmd) for cmd in existing_jobs))
+ else:
+ stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
+ stderr(' To schedule a new job, run:')
+ stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
+ raise SystemExit(0)
+
+ elif clear:
+ print(cron.remove_all(comment=CRON_COMMENT))
+ cron.write()
+ raise SystemExit(0)
+
+ elif every:
+ quoted = lambda s: f'"{s}"' if s and ' ' in s else s
+ cmd = [
+ 'cd',
+ quoted(out_dir),
+ '&&',
+ quoted(ARCHIVEBOX_BINARY),
+ *(['add', f'"{import_path}"'] if import_path else ['update']),
+        '>',
+        quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
+        '2>&1',
+
+ ]
+ new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
+
+ if every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
+ set_every = getattr(new_job.every(), every)
+ set_every()
+ elif CronSlices.is_valid(every):
+ new_job.setall(every)
+ else:
+ stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
+        stderr('    It must be one of minute/hour/day/week/month/year')
+ stderr(' or a quoted cron-format schedule like:')
+        stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
+        stderr('        archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
+ raise SystemExit(1)
+
+ cron = dedupe_jobs(cron)
+ cron.write()
+
+ total_runs = sum(j.frequency_per_year() for j in cron)
+ existing_jobs = list(cron.find_comment(CRON_COMMENT))
+
+ print()
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
+ print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
+ if total_runs > 60 and not quiet:
+ stderr()
+ stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
+        stderr('    Congrats on being an enthusiastic internet archiver!')
+ stderr()
+ stderr(' Make sure you have enough storage space available to hold all the data.')
+ stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
+ raise SystemExit(0)
+
+
+
+
+
+def server(runserver_args: Optional[List[str]]=None, reload: bool=False, out_dir: str=OUTPUT_DIR) -> None:
+ runserver_args = runserver_args or []
+ check_data_folder(out_dir=out_dir)
+
+ setup_django(out_dir)
+ from django.core.management import call_command
+ from django.contrib.auth.models import User
+
+ if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+ print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
+ print()
+ print(' To create an admin user, run:')
+ print(' archivebox manage createsuperuser')
+ print()
+
+ print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
+ if not reload:
+ runserver_args.append('--noreload')
+
+ call_command("runserver", *runserver_args)
+
+
+def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
+ check_data_folder(out_dir=out_dir)
+
+ setup_django(out_dir)
+ from django.core.management import execute_from_command_line
+
+ execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
+
+def shell(out_dir: str=OUTPUT_DIR) -> None:
+ check_data_folder(out_dir=out_dir)
+
+ setup_django(OUTPUT_DIR)
+ from django.core.management import call_command
+ call_command("shell_plus")
+
+# Helpers
+
+def printable_config(config: ConfigDict, prefix: str='') -> str:
+ return f'\n{prefix}'.join(
+ f'{key}={val}'
+ for key, val in config.items()
+ if not (isinstance(val, dict) or callable(val))
+ )
+
+def dedupe_jobs(cron: CronTab) -> CronTab:
+ deduped: Set[Tuple[str, str]] = set()
+
+ for job in list(cron):
+ unique_tuple = (str(job.slices), job.command)
+ if unique_tuple not in deduped:
+ deduped.add(unique_tuple)
+ cron.remove(job)
+
+ for schedule, command in deduped:
+ job = cron.new(command=command, comment=CRON_COMMENT)
+ job.setall(schedule)
+ job.enable()
+
+ return cron
+
+
+def print_folder_status(name, folder):
+ if folder['enabled']:
+ if folder['is_valid']:
+            color, symbol, note = 'green', '√', 'valid'
+ else:
+ color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
+ else:
+ color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
+
+ if folder['path']:
+ if os.path.exists(folder['path']):
+ num_files = (
+ f'{len(os.listdir(folder["path"]))} files'
+ if os.path.isdir(folder['path']) else
+ human_readable_size(os.path.getsize(folder['path']))
+ )
+ else:
+ num_files = 'missing'
+
+ if ' ' in folder['path']:
+ folder['path'] = f'"{folder["path"]}"'
+
+ print(
+ ANSI[color],
+ symbol,
+ ANSI['reset'],
+ name.ljust(22),
+ (folder["path"] or '').ljust(76),
+ num_files.ljust(14),
+ ANSI[color],
+ note,
+ ANSI['reset'],
+ )
+
+
+def print_dependency_version(name, dependency):
+ if dependency['enabled']:
+ if dependency['is_valid']:
+            color, symbol, note = 'green', '√', 'valid'
+ version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
+ else:
+ color, symbol, note, version = 'red', 'X', 'invalid', '?'
+ else:
+ color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
+
+ if ' ' in dependency["path"]:
+ dependency["path"] = f'"{dependency["path"]}"'
+
+ print(
+ ANSI[color],
+ symbol,
+ ANSI['reset'],
+ name.ljust(22),
+ (dependency["path"] or '').ljust(76),
+ version.ljust(14),
+ ANSI[color],
+ note,
+ ANSI['reset'],
+ )
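Note: the command functions above double as a plain Python API. A minimal usage sketch, assuming an already-initialized ArchiveBox data folder and that these functions are importable from the new main module (the ./my-archive path and the regex pattern below are made up for illustration):

    from archivebox.main import list_links, remove

    # print every archived link newer than a given unix timestamp
    for link in list_links(after=1500000000.0, out_dir='./my-archive'):
        print(link.timestamp, link.url)

    # drop matching links from the index without deleting their snapshot folders
    kept = remove(
        filter_str='.*example\\.com.*',
        filter_type='regex',
        yes=True,        # don't ask for interactive confirmation
        delete=False,    # keep the existing snapshot folders on disk
        out_dir='./my-archive',
    )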
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
new file mode 100644
index 00000000..2a20ff6d
--- /dev/null
+++ b/archivebox/parsers/__init__.py
@@ -0,0 +1,68 @@
+"""
+Everything related to parsing links from input sources.
+
+For a list of supported services, see the README.md.
+For examples of supported import formats see tests/.
+"""
+
+__package__ = 'archivebox.parsers'
+
+
+from typing import Tuple, List
+
+from ..config import TIMEOUT
+from ..util import (
+ check_url_parsing_invariants,
+ TimedProgress,
+ Link,
+ enforce_types,
+)
+from .pocket_html import parse_pocket_html_export
+from .pinboard_rss import parse_pinboard_rss_export
+from .shaarli_rss import parse_shaarli_rss_export
+from .medium_rss import parse_medium_rss_export
+from .netscape_html import parse_netscape_html_export
+from .generic_rss import parse_generic_rss_export
+from .generic_json import parse_generic_json_export
+from .generic_txt import parse_generic_txt_export
+
+
+@enforce_types
+def parse_links(source_file: str) -> Tuple[List[Link], str]:
+ """parse a list of URLs with their metadata from an
+ RSS feed, bookmarks export, or text file
+ """
+
+ check_url_parsing_invariants()
+ PARSERS = (
+ # Specialized parsers
+ ('Pocket HTML', parse_pocket_html_export),
+ ('Pinboard RSS', parse_pinboard_rss_export),
+ ('Shaarli RSS', parse_shaarli_rss_export),
+ ('Medium RSS', parse_medium_rss_export),
+
+ # General parsers
+ ('Netscape HTML', parse_netscape_html_export),
+ ('Generic RSS', parse_generic_rss_export),
+ ('Generic JSON', parse_generic_json_export),
+
+ # Fallback parser
+ ('Plain Text', parse_generic_txt_export),
+ )
+ timer = TimedProgress(TIMEOUT * 4)
+ with open(source_file, 'r', encoding='utf-8') as file:
+ for parser_name, parser_func in PARSERS:
+ try:
+ links = list(parser_func(file))
+ if links:
+ timer.end()
+ return links, parser_name
+ except Exception as err: # noqa
+ # Parsers are tried one by one down the list, and the first one
+ # that succeeds is used. To see why a certain parser was not used
+ # due to error or format incompatibility, uncomment this line:
+ # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
+ pass
+
+ timer.end()
+ return [], 'Failed to parse'
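For reference, a rough sketch of driving parse_links() directly on a hand-made text source (hypothetical path; assumes the archivebox package and its config imports resolve in the current environment):

    from archivebox.parsers import parse_links

    # write a throwaway sources file containing two bare URLs
    with open('/tmp/bookmarks.txt', 'w', encoding='utf-8') as f:
        f.write('https://example.com\nhttps://example.org\n')

    links, parser_name = parse_links('/tmp/bookmarks.txt')
    print(f'Parsed {len(links)} links using the {parser_name} parser')
    for link in links:
        print(link.timestamp, link.url)

Because every parser is tried in order and the first one that yields links wins, a plain list of URLs like this ends up handled by the 'Plain Text' fallback at the bottom of the table.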
diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py
new file mode 100644
index 00000000..8b20e6f4
--- /dev/null
+++ b/archivebox/parsers/generic_json.py
@@ -0,0 +1,65 @@
+__package__ = 'archivebox.parsers'
+
+import json
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+ htmldecode,
+ enforce_types,
+)
+
+
+@enforce_types
+def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]:
+ """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
+
+ json_file.seek(0)
+ links = json.load(json_file)
+ json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
+
+ for link in links:
+ # example line
+ # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
+ if link:
+ # Parse URL
+ url = link.get('href') or link.get('url') or link.get('URL')
+ if not url:
+ raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
+
+ # Parse the timestamp
+ ts_str = str(datetime.now().timestamp())
+ if link.get('timestamp'):
+ # chrome/ff histories use a very precise timestamp
+ ts_str = str(link['timestamp'] / 10000000)
+ elif link.get('time'):
+ ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
+ elif link.get('created_at'):
+ ts_str = str(json_date(link['created_at']).timestamp())
+ elif link.get('created'):
+ ts_str = str(json_date(link['created']).timestamp())
+ elif link.get('date'):
+ ts_str = str(json_date(link['date']).timestamp())
+ elif link.get('bookmarked'):
+ ts_str = str(json_date(link['bookmarked']).timestamp())
+ elif link.get('saved'):
+ ts_str = str(json_date(link['saved']).timestamp())
+
+ # Parse the title
+ title = None
+ if link.get('title'):
+ title = link['title'].strip()
+ elif link.get('description'):
+                title = link['description'].replace(' — Readability', '').strip()
+ elif link.get('name'):
+ title = link['name'].strip()
+
+ yield Link(
+ url=htmldecode(url),
+ timestamp=ts_str,
+ title=htmldecode(title) or None,
+ tags=htmldecode(link.get('tags')) or '',
+ sources=[json_file.name],
+ )
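A quick sketch of the kind of record this parser accepts, pushed through a file handle by hand (the keys mirror the fields checked above; the path is made up):

    import json
    from archivebox.parsers.generic_json import parse_generic_json_export

    bookmarks = [{
        'href': 'https://example.com/post',
        'description': 'Example post',
        'time': '2019-01-30T06:06:01+0000',
        'tags': 'example demo',
    }]

    # the parser seeks to the start of the handle itself, so a w+ handle works
    with open('/tmp/bookmarks.json', 'w+', encoding='utf-8') as f:
        json.dump(bookmarks, f)
        for link in parse_generic_json_export(f):
            print(link.url, link.timestamp, link.tags)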
diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py
new file mode 100644
index 00000000..3a62bb88
--- /dev/null
+++ b/archivebox/parsers/generic_rss.py
@@ -0,0 +1,49 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+ htmldecode,
+ enforce_types,
+ str_between,
+)
+
+@enforce_types
+def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+ """Parse RSS XML-format files into links"""
+
+ rss_file.seek(0)
+    items = rss_file.read().split('<item>')
+    items = items[1:] if items else []
+    for item in items:
+        # example item:
+        # <item>
+        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
+        # <category>Unread</category>
+        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
+        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
+        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
+        # </item>
+
+        trailing_removed = item.split('</item>', 1)[0]
+        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
+        rows = leading_removed.split('\n')
+
+        def get_row(key):
+            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
+
+        url = str_between(get_row('link'), '<link>', '</link>')
+        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
+        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
+        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
+
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=None,
+            sources=[rss_file.name],
+        )
diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py
new file mode 100644
--- /dev/null
+++ b/archivebox/parsers/generic_txt.py
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    URL_REGEX,
+)
+
+
+@enforce_types
+def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
+ """Parse raw links from each line in a text file"""
+
+ text_file.seek(0)
+ for line in text_file.readlines():
+ urls = re.findall(URL_REGEX, line) if line.strip() else ()
+ for url in urls: # type: ignore
+ yield Link(
+ url=htmldecode(url),
+ timestamp=str(datetime.now().timestamp()),
+ title=None,
+ tags=None,
+ sources=[text_file.name],
+ )
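The plain-text fallback just regex-scans each line, so a single line of prose can yield several Links (illustrative sketch, made-up path):

    from archivebox.parsers.generic_txt import parse_generic_txt_export

    with open('/tmp/notes.txt', 'w+', encoding='utf-8') as f:
        f.write('see https://example.com/a and https://example.com/b\n')
        for link in parse_generic_txt_export(f):
            print(link.url)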
diff --git a/archivebox/parsers/medium_rss.py b/archivebox/parsers/medium_rss.py
new file mode 100644
index 00000000..11379677
--- /dev/null
+++ b/archivebox/parsers/medium_rss.py
@@ -0,0 +1,35 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from xml.etree import ElementTree
+
+from ..index.schema import Link
+from ..util import (
+ htmldecode,
+ enforce_types,
+)
+
+
+@enforce_types
+def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+ """Parse Medium RSS feed files into links"""
+
+ rss_file.seek(0)
+ root = ElementTree.parse(rss_file).getroot()
+ items = root.find("channel").findall("item") # type: ignore
+ for item in items:
+ url = item.find("link").text # type: ignore
+ title = item.find("title").text.strip() # type: ignore
+ ts_str = item.find("pubDate").text # type: ignore
+ time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
+
+ yield Link(
+ url=htmldecode(url),
+ timestamp=str(time.timestamp()),
+ title=htmldecode(title) or None,
+ tags=None,
+ sources=[rss_file.name],
+ )
diff --git a/archivebox/parsers/netscape_html.py b/archivebox/parsers/netscape_html.py
new file mode 100644
index 00000000..894e2318
--- /dev/null
+++ b/archivebox/parsers/netscape_html.py
@@ -0,0 +1,39 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+ htmldecode,
+ enforce_types,
+)
+
+
+@enforce_types
+def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
+ """Parse netscape-format bookmarks export files (produced by all browsers)"""
+
+ html_file.seek(0)
+ pattern = re.compile("]*>(.+)", re.UNICODE | re.IGNORECASE)
+ for line in html_file:
+ # example line
+        # <DT><A HREF="https://example.com/?example=1" ADD_DATE="1497562974" LAST_MODIFIED="1497562974">example bookmark title</A>
+
+ match = pattern.search(line)
+ if match:
+ url = match.group(1)
+ time = datetime.fromtimestamp(float(match.group(2)))
+ title = match.group(3).strip()
+
+ yield Link(
+ url=htmldecode(url),
+ timestamp=str(time.timestamp()),
+ title=htmldecode(title) or None,
+ tags=None,
+ sources=[html_file.name],
+ )
+
diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py
new file mode 100644
index 00000000..eb21c7ef
--- /dev/null
+++ b/archivebox/parsers/pinboard_rss.py
@@ -0,0 +1,47 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from xml.etree import ElementTree
+
+from ..index.schema import Link
+from ..util import (
+ htmldecode,
+ enforce_types,
+)
+
+
+@enforce_types
+def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+ """Parse Pinboard RSS feed files into links"""
+
+ rss_file.seek(0)
+ root = ElementTree.parse(rss_file).getroot()
+ items = root.findall("{http://purl.org/rss/1.0/}item")
+ for item in items:
+ find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore
+
+ url = find("{http://purl.org/rss/1.0/}link")
+ tags = find("{http://purl.org/dc/elements/1.1/}subject")
+ title = find("{http://purl.org/rss/1.0/}title")
+ ts_str = find("{http://purl.org/dc/elements/1.1/}date")
+
+ # Pinboard includes a colon in its date stamp timezone offsets, which
+ # Python can't parse. Remove it:
+ if ts_str and ts_str[-3:-2] == ":":
+ ts_str = ts_str[:-3]+ts_str[-2:]
+
+ if ts_str:
+ time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+ else:
+ time = datetime.now()
+
+ yield Link(
+ url=htmldecode(url),
+ timestamp=str(time.timestamp()),
+ title=htmldecode(title) or None,
+ tags=htmldecode(tags) or None,
+ sources=[rss_file.name],
+ )
diff --git a/archivebox/parsers/pocket_html.py b/archivebox/parsers/pocket_html.py
new file mode 100644
index 00000000..3eae58c4
--- /dev/null
+++ b/archivebox/parsers/pocket_html.py
@@ -0,0 +1,38 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+ htmldecode,
+ enforce_types,
+)
+
+
+@enforce_types
+def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
+ """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
+
+ html_file.seek(0)
+ pattern = re.compile("^\\s*(.+)", re.UNICODE)
+ for line in html_file:
+ # example line
+        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
+ match = pattern.search(line)
+ if match:
+ url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
+ time = datetime.fromtimestamp(float(match.group(2)))
+ tags = match.group(3)
+            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
+
+ yield Link(
+ url=htmldecode(url),
+ timestamp=str(time.timestamp()),
+ title=htmldecode(title) or None,
+ tags=tags or '',
+ sources=[html_file.name],
+ )
diff --git a/archivebox/parsers/shaarli_rss.py b/archivebox/parsers/shaarli_rss.py
new file mode 100644
index 00000000..ae5bfa96
--- /dev/null
+++ b/archivebox/parsers/shaarli_rss.py
@@ -0,0 +1,50 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+ htmldecode,
+ enforce_types,
+ str_between,
+)
+
+
+@enforce_types
+def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+ """Parse Shaarli-specific RSS XML-format files into links"""
+
+ rss_file.seek(0)
+    entries = rss_file.read().split('<entry>')[1:]
+    for entry in entries:
+        # example entry:
+        # <entry>
+        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
+        #   <link href="https://demo.shaarli.org/?cEV4vw" />
+        #   <id>https://demo.shaarli.org/?cEV4vw</id>
+        #   <published>2019-01-30T06:06:01+00:00</published>
+        #   <updated>2019-01-30T06:06:01+00:00</updated>
+        #   <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>— <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
+        # </entry>
+
+        trailing_removed = entry.split('</entry>', 1)[0]
+        leading_removed = trailing_removed.strip()
+        rows = leading_removed.split('\n')
+
+        def get_row(key):
+            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
+
+        title = str_between(get_row('title'), '<title>', '</title>').strip()
+        url = str_between(get_row('link'), '<link href="', '" />')
+        ts_str = str_between(get_row('published'), '<published>', '</published>')
+ time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+
+ yield Link(
+ url=htmldecode(url),
+ timestamp=str(time.timestamp()),
+ title=htmldecode(title) or None,
+ tags=None,
+ sources=[rss_file.name],
+ )
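All of the string-splitting parsers above lean on str_between() from ..util; roughly, it slices out the text between two delimiter strings. A simplified stand-in (not the actual util implementation) for readers following along:

    def str_between(string: str, start: str, end: str) -> str:
        # take everything after the first occurrence of start,
        # then everything before the last occurrence of end
        return string.split(start, 1)[-1].rsplit(end, 1)[0]

    assert str_between('<title>Hello</title>', '<title>', '</title>') == 'Hello'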
diff --git a/archivebox/legacy/templates/favicon.ico b/archivebox/themes/legacy/favicon.ico
similarity index 100%
rename from archivebox/legacy/templates/favicon.ico
rename to archivebox/themes/legacy/favicon.ico
diff --git a/archivebox/legacy/templates/link_details.html b/archivebox/themes/legacy/link_details.html
similarity index 100%
rename from archivebox/legacy/templates/link_details.html
rename to archivebox/themes/legacy/link_details.html
diff --git a/archivebox/legacy/templates/main_index.html b/archivebox/themes/legacy/main_index.html
similarity index 100%
rename from archivebox/legacy/templates/main_index.html
rename to archivebox/themes/legacy/main_index.html
diff --git a/archivebox/legacy/templates/main_index_row.html b/archivebox/themes/legacy/main_index_row.html
similarity index 100%
rename from archivebox/legacy/templates/main_index_row.html
rename to archivebox/themes/legacy/main_index_row.html
diff --git a/archivebox/legacy/templates/robots.txt b/archivebox/themes/legacy/robots.txt
similarity index 100%
rename from archivebox/legacy/templates/robots.txt
rename to archivebox/themes/legacy/robots.txt
diff --git a/archivebox/legacy/templates/static/archive.png b/archivebox/themes/legacy/static/archive.png
similarity index 100%
rename from archivebox/legacy/templates/static/archive.png
rename to archivebox/themes/legacy/static/archive.png
diff --git a/archivebox/legacy/templates/static/bootstrap.min.css b/archivebox/themes/legacy/static/bootstrap.min.css
similarity index 100%
rename from archivebox/legacy/templates/static/bootstrap.min.css
rename to archivebox/themes/legacy/static/bootstrap.min.css
diff --git a/archivebox/legacy/templates/static/external.png b/archivebox/themes/legacy/static/external.png
similarity index 100%
rename from archivebox/legacy/templates/static/external.png
rename to archivebox/themes/legacy/static/external.png
diff --git a/archivebox/legacy/templates/static/jquery.dataTables.min.css b/archivebox/themes/legacy/static/jquery.dataTables.min.css
similarity index 100%
rename from archivebox/legacy/templates/static/jquery.dataTables.min.css
rename to archivebox/themes/legacy/static/jquery.dataTables.min.css
diff --git a/archivebox/legacy/templates/static/jquery.dataTables.min.js b/archivebox/themes/legacy/static/jquery.dataTables.min.js
similarity index 100%
rename from archivebox/legacy/templates/static/jquery.dataTables.min.js
rename to archivebox/themes/legacy/static/jquery.dataTables.min.js
diff --git a/archivebox/legacy/templates/static/jquery.min.js b/archivebox/themes/legacy/static/jquery.min.js
similarity index 100%
rename from archivebox/legacy/templates/static/jquery.min.js
rename to archivebox/themes/legacy/static/jquery.min.js
diff --git a/archivebox/legacy/templates/static/sort_asc.png b/archivebox/themes/legacy/static/sort_asc.png
similarity index 100%
rename from archivebox/legacy/templates/static/sort_asc.png
rename to archivebox/themes/legacy/static/sort_asc.png
diff --git a/archivebox/legacy/templates/static/sort_both.png b/archivebox/themes/legacy/static/sort_both.png
similarity index 100%
rename from archivebox/legacy/templates/static/sort_both.png
rename to archivebox/themes/legacy/static/sort_both.png
diff --git a/archivebox/legacy/templates/static/sort_desc.png b/archivebox/themes/legacy/static/sort_desc.png
similarity index 100%
rename from archivebox/legacy/templates/static/sort_desc.png
rename to archivebox/themes/legacy/static/sort_desc.png
diff --git a/archivebox/legacy/templates/static/spinner.gif b/archivebox/themes/legacy/static/spinner.gif
similarity index 100%
rename from archivebox/legacy/templates/static/spinner.gif
rename to archivebox/themes/legacy/static/spinner.gif
diff --git a/archivebox/legacy/util.py b/archivebox/util.py
similarity index 93%
rename from archivebox/legacy/util.py
rename to archivebox/util.py
index 327f0270..447b9eff 100644
--- a/archivebox/legacy/util.py
+++ b/archivebox/util.py
@@ -1,6 +1,7 @@
import os
import re
import sys
+import ssl
import json
import time
import shutil
@@ -8,7 +9,7 @@ import argparse
from string import Template
from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO, Mapping, Tuple
+from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple
from inspect import signature
from functools import wraps
from hashlib import sha256
@@ -28,11 +29,12 @@ from subprocess import (
from base32_crockford import encode as base32_encode # type: ignore
-from .schema import Link
+from .index.schema import Link
from .config import (
ANSI,
TERM_WIDTH,
- SOURCES_DIR,
+ OUTPUT_DIR,
+ SOURCES_DIR_NAME,
OUTPUT_PERMISSIONS,
TIMEOUT,
SHOW_PROGRESS,
@@ -40,8 +42,9 @@ from .config import (
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CHROME_OPTIONS,
+ check_data_folder,
)
-from .logs import pretty_path
+from .cli.logging import pretty_path
### Parsing Helpers
@@ -187,31 +190,36 @@ def check_url_parsing_invariants() -> None:
### Random Helpers
@enforce_types
-def handle_stdin_import(raw_text: str) -> str:
- if not os.path.exists(SOURCES_DIR):
- os.makedirs(SOURCES_DIR)
+def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
+ check_data_folder(out_dir=out_dir)
+
+ sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
+ if not os.path.exists(sources_dir):
+ os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
- source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
+ source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
atomic_write(raw_text, source_path)
return source_path
@enforce_types
-def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
+def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-.txt"""
+ check_data_folder(out_dir=out_dir)
- if not os.path.exists(SOURCES_DIR):
- os.makedirs(SOURCES_DIR)
+ sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
+ if not os.path.exists(sources_dir):
+ os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
- source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))
+ source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
- source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
+ source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -532,7 +540,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
if CHECK_SSL_VALIDITY:
resp = urlopen(req, timeout=timeout)
else:
- import ssl
insecure = ssl._create_unverified_context()
resp = urlopen(req, timeout=timeout, context=insecure)
@@ -662,7 +669,7 @@ def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=Tr
return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
-def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
+def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
header: bool=True, ljust: int=0, separator: str=',') -> str:
csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
@@ -677,6 +684,8 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
return '\n'.join((header_str, *row_strs))
+def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
+ return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
@enforce_types
def render_template(template_path: str, context: Mapping[str, str]) -> str:
@@ -713,11 +722,11 @@ def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
os.remove(tmp_file)
-def reject_stdin(caller: str) -> None:
+def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
- if not sys.stdin.isatty():
- stdin_raw_text = sys.stdin.read().strip()
+ if stdin and not stdin.isatty():
+ stdin_raw_text = stdin.read().strip()
if stdin_raw_text:
print(
'{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
@@ -731,9 +740,30 @@ def reject_stdin(caller: str) -> None:
print()
raise SystemExit(1)
+def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
+ if stdin and not stdin.isatty():
+ return stdin.read()
+ return None
+
+
+def set_docstring(text: str):
+ def decorator(func):
+ @wraps(func)
+ def wrapper_with_docstring(*args, **kwargs):
+ return func(*args, **kwargs)
+ wrapper_with_docstring.__doc__ = text
+ return wrapper_with_docstring
+ return decorator
+
class SmartFormatter(argparse.HelpFormatter):
def _split_lines(self, text, width):
if '\n' in text:
return text.splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
+
+
+class ArchiveError(Exception):
+ def __init__(self, message, hints=None):
+ super().__init__(message)
+ self.hints = hints
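A small sketch of how the new stdin helpers behave when handed an explicit file object instead of the real sys.stdin (the caller name passed to reject_stdin is arbitrary):

    import io
    from archivebox.util import accept_stdin, reject_stdin

    piped = io.StringIO('https://example.com\nhttps://example.org\n')
    print(accept_stdin(stdin=piped))                   # returns the piped text

    reject_stdin('version', stdin=io.StringIO(''))     # empty stdin: returns quietly
    # reject_stdin('version', stdin=io.StringIO('x'))  # non-empty stdin: prints an error
    #                                                  # and raises SystemExit(1)

Threading the stream through explicitly like this is what makes the CLI subcommands easier to test without monkey-patching sys.stdin.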