From cdb70c73df0b593e08e00f6191e349fbbe3494c1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 17 Apr 2019 03:49:18 -0400 Subject: [PATCH] first working django model with archivebox-shell command and sql exporting --- archivebox/__init__.py | 2 ++ archivebox/cli/archivebox_remove.py | 5 ++- archivebox/cli/archivebox_shell.py | 31 ++++++++++++++++++ archivebox/core/__init__.py | 1 + archivebox/core/migrations/0001_initial.py | 28 ++++++++++++++++ .../migrations/0002_auto_20190417_0739.py | 27 ++++++++++++++++ archivebox/core/models.py | 32 ++++++++++++++++++- archivebox/core/settings.py | 24 +++++++------- archivebox/legacy/config.py | 14 ++++++-- archivebox/legacy/index.py | 16 ++++++++++ archivebox/legacy/main.py | 6 ++++ archivebox/legacy/mypy_django.ini | 10 ++++++ archivebox/legacy/storage/sql.py | 32 +++++++++++++++++++ archivebox/mypy.ini | 3 ++ archivebox/tests.py | 1 + requirements.txt | 1 + setup.py | 3 +- 17 files changed, 215 insertions(+), 21 deletions(-) create mode 100644 archivebox/cli/archivebox_shell.py create mode 100644 archivebox/core/migrations/0001_initial.py create mode 100644 archivebox/core/migrations/0002_auto_20190417_0739.py create mode 100644 archivebox/legacy/mypy_django.ini create mode 100644 archivebox/legacy/storage/sql.py create mode 100644 archivebox/mypy.ini diff --git a/archivebox/__init__.py b/archivebox/__init__.py index b0c00b61..4cd3afd5 100644 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1 +1,3 @@ __package__ = 'archivebox' + +from . import core diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index d2b792f5..26bf8262 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -8,9 +8,8 @@ import sys import argparse -from ..legacy.main import list_archive_data, remove_archive_links -from ..legacy.util import reject_stdin, to_csv, TimedProgress -from ..legacy.config import ANSI +from ..legacy.main import remove_archive_links +from ..legacy.util import reject_stdin def main(args=None): diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py new file mode 100644 index 00000000..6fc84c40 --- /dev/null +++ b/archivebox/cli/archivebox_shell.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +__package__ = 'archivebox.cli' +__command__ = 'archivebox shell' +__description__ = 'Enter an interactive ArchiveBox Django shell' + +import sys +import argparse + +from ..legacy.config import setup_django +from ..legacy.util import reject_stdin + + +def main(args=None): + args = sys.argv[1:] if args is None else args + + parser = argparse.ArgumentParser( + prog=__command__, + description=__description__, + add_help=True, + ) + parser.parse_args(args) + reject_stdin(__command__) + + setup_django() + from django.core.management import call_command + call_command("shell_plus") + + +if __name__ == '__main__': + main() diff --git a/archivebox/core/__init__.py b/archivebox/core/__init__.py index e69de29b..3e1d607a 100644 --- a/archivebox/core/__init__.py +++ b/archivebox/core/__init__.py @@ -0,0 +1 @@ +__package__ = 'archivebox.core' diff --git a/archivebox/core/migrations/0001_initial.py b/archivebox/core/migrations/0001_initial.py new file mode 100644 index 00000000..366db56c --- /dev/null +++ b/archivebox/core/migrations/0001_initial.py @@ -0,0 +1,28 @@ +# Generated by Django 2.2 on 2019-04-17 06:46 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Page', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), + ('url', models.URLField()), + ('timestamp', models.CharField(default=None, max_length=32, null=True)), + ('title', models.CharField(default=None, max_length=128, null=True)), + ('tags', models.CharField(default=None, max_length=256, null=True)), + ('added', models.DateTimeField(auto_now_add=True)), + ('bookmarked', models.DateTimeField()), + ('updated', models.DateTimeField(default=None, null=True)), + ], + ), + ] diff --git a/archivebox/core/migrations/0002_auto_20190417_0739.py b/archivebox/core/migrations/0002_auto_20190417_0739.py new file mode 100644 index 00000000..a265c13d --- /dev/null +++ b/archivebox/core/migrations/0002_auto_20190417_0739.py @@ -0,0 +1,27 @@ +# Generated by Django 2.2 on 2019-04-17 07:39 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0001_initial'), + ] + + operations = [ + migrations.RemoveField( + model_name='page', + name='bookmarked', + ), + migrations.AlterField( + model_name='page', + name='timestamp', + field=models.CharField(default=None, max_length=32, null=True, unique=True), + ), + migrations.AlterField( + model_name='page', + name='url', + field=models.URLField(unique=True), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 71a83623..1951c37d 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,3 +1,33 @@ +__package__ = 'archivebox.core' + +import uuid + from django.db import models -# Create your models here. + +class Page(models.Model): + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) + + url = models.URLField(unique=True) + timestamp = models.CharField(unique=True, max_length=32, null=True, default=None) + + title = models.CharField(max_length=128, null=True, default=None) + tags = models.CharField(max_length=256, null=True, default=None) + + added = models.DateTimeField(auto_now_add=True) + updated = models.DateTimeField(null=True, default=None) + # bookmarked = models.DateTimeField() + + sql_args = ('url', 'timestamp', 'title', 'tags', 'updated') + + @classmethod + def from_json(cls, info: dict): + info = {k: v for k, v in info.items() if k in cls.sql_args} + return cls(**info) + + def as_json(self, *args) -> dict: + args = args or self.sql_args + return { + key: getattr(self, key) + for key in args + } diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index b7ffbe18..b168e6e2 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -1,24 +1,22 @@ __package__ = 'archivebox.core' -from ..legacy.config import ( - TEMPLATES_DIR, - DATABASE_FILE, -) - +import os SECRET_KEY = '---------------- not a valid secret key ! ----------------' DEBUG = True INSTALLED_APPS = [ - # 'django.contrib.admin', - # 'django.contrib.auth', - # 'django.contrib.contenttypes', - # 'django.contrib.sessions', - # 'django.contrib.messages', - # 'django.contrib.staticfiles', + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', 'core', + + 'django_extensions', ] MIDDLEWARE = [ @@ -35,7 +33,7 @@ ROOT_URLCONF = 'core.urls' TEMPLATES = [ { 'BACKEND': 'django.template.backends.django.DjangoTemplates', - 'DIRS': [TEMPLATES_DIR], + 'DIRS': ['templates'], 'APP_DIRS': True, 'OPTIONS': { 'context_processors': [ @@ -53,7 +51,7 @@ WSGI_APPLICATION = 'core.wsgi.application' DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': DATABASE_FILE, + 'NAME': os.path.join(os.path.abspath(os.curdir), 'database', 'database.sqlite3'), } } diff --git a/archivebox/legacy/config.py b/archivebox/legacy/config.py index c158e52b..8842b793 100644 --- a/archivebox/legacy/config.py +++ b/archivebox/legacy/config.py @@ -1,14 +1,15 @@ +__package__ = 'archivebox.legacy' + import os import re import sys -import getpass import django +import getpass import shutil from typing import Optional from subprocess import run, PIPE, DEVNULL - # ****************************************************************************** # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration # Use the 'env' command to pass config options to ArchiveBox. e.g.: @@ -93,10 +94,11 @@ else: ARCHIVE_DIR_NAME = 'archive' SOURCES_DIR_NAME = 'sources' DATABASE_DIR_NAME = 'database' +DATABASE_FILE_NAME = 'database.sqlite3' ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME) SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME) DATABASE_DIR = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME) -DATABASE_FILE = os.path.join(DATABASE_DIR, 'database.sqlite3') +DATABASE_FILE = os.path.join(DATABASE_DIR, DATABASE_FILE_NAME) PYTHON_DIR = os.path.join(REPO_DIR, 'archivebox') LEGACY_DIR = os.path.join(PYTHON_DIR, 'legacy') @@ -221,6 +223,12 @@ def find_chrome_data_dir() -> Optional[str]: return None +def setup_django(): + import django + sys.path.append(PYTHON_DIR) + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') + django.setup() + # ****************************************************************************** # ************************ Environment & Dependencies ************************** # ****************************************************************************** diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py index 4df15e30..173d6b7c 100644 --- a/archivebox/legacy/index.py +++ b/archivebox/legacy/index.py @@ -6,6 +6,8 @@ from collections import OrderedDict from .schema import Link, ArchiveResult from .config import ( + DATABASE_DIR, + DATABASE_FILE_NAME, OUTPUT_DIR, TIMEOUT, URL_BLACKLIST_PTN, @@ -19,6 +21,10 @@ from .storage.json import ( parse_json_link_details, write_json_link_details, ) +from .storage.sql import ( + write_sql_main_index, + parse_sql_main_index, +) from .util import ( scheme, enforce_types, @@ -204,6 +210,14 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool= log_indexing_process_started() + log_indexing_started(DATABASE_DIR, DATABASE_FILE_NAME) + timer = TimedProgress(TIMEOUT * 2, prefix=' ') + try: + write_sql_main_index(links) + finally: + timer.end() + log_indexing_finished(DATABASE_DIR, DATABASE_FILE_NAME) + log_indexing_started(out_dir, 'index.json') timer = TimedProgress(TIMEOUT * 2, prefix=' ') try: @@ -228,6 +242,8 @@ def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> existing_links: List[Link] = [] if out_dir: existing_links = list(parse_json_main_index(out_dir)) + existing_sql_links = list(parse_sql_main_index()) + assert set(l.url for l in existing_links) == set(l['url'] for l in existing_sql_links) new_links: List[Link] = [] if import_path: diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py index c437d5d4..72e949ad 100644 --- a/archivebox/legacy/main.py +++ b/archivebox/legacy/main.py @@ -22,6 +22,7 @@ from .config import ( DATABASE_DIR, check_dependencies, check_data_folder, + setup_django, ) from .logs import ( log_archiving_started, @@ -75,6 +76,11 @@ def init(): write_main_index([], out_dir=OUTPUT_DIR, finished=True) + setup_django() + from django.core.management import call_command + call_command("makemigrations", interactive=False) + call_command("migrate", interactive=False) + stderr('{green}[√] Done.{reset}'.format(**ANSI)) diff --git a/archivebox/legacy/mypy_django.ini b/archivebox/legacy/mypy_django.ini new file mode 100644 index 00000000..306e567c --- /dev/null +++ b/archivebox/legacy/mypy_django.ini @@ -0,0 +1,10 @@ +[mypy_django_plugin] + +# specify settings module to use for django.conf.settings, this setting +# could also be specified with DJANGO_SETTINGS_MODULE environment variable +# (it also takes priority over config file) +django_settings = core.settings + +# if True, all unknown settings in django.conf.settings will fallback to Any, +# specify it if your settings are loaded dynamically to avoid false positives +ignore_missing_settings = True diff --git a/archivebox/legacy/storage/sql.py b/archivebox/legacy/storage/sql.py new file mode 100644 index 00000000..c4f03bb0 --- /dev/null +++ b/archivebox/legacy/storage/sql.py @@ -0,0 +1,32 @@ +__package__ = 'archivebox.legacy.storage' + +from typing import List, Iterator + +from ..schema import Link +from ..util import enforce_types +from ..config import setup_django + + +### Main Links Index + +sql_keys = ('url', 'timestamp', 'title', 'tags', 'updated') + + +@enforce_types +def parse_sql_main_index() -> Iterator[Link]: + setup_django() + from core.models import Page + + return ( + page.as_json(*sql_keys) + for page in Page.objects.all() + ) + +@enforce_types +def write_sql_main_index(links: List[Link]) -> None: + setup_django() + from core.models import Page + + for link in links: + info = {k: v for k, v in link._asdict().items() if k in sql_keys} + Page.objects.update_or_create(url=link.url, defaults=info) diff --git a/archivebox/mypy.ini b/archivebox/mypy.ini new file mode 100644 index 00000000..b1b4489a --- /dev/null +++ b/archivebox/mypy.ini @@ -0,0 +1,3 @@ +[mypy] +plugins = + mypy_django_plugin.main diff --git a/archivebox/tests.py b/archivebox/tests.py index 80096e8a..6afb6c7d 100755 --- a/archivebox/tests.py +++ b/archivebox/tests.py @@ -2,6 +2,7 @@ __package__ = 'archivebox' + import os import sys import shutil diff --git a/requirements.txt b/requirements.txt index eb9861dd..d7b43bc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ base32-crockford setuptools ipdb mypy +django-stubs flake8 #wpull diff --git a/setup.py b/setup.py index b6137740..1c048d8a 100644 --- a/setup.py +++ b/setup.py @@ -36,9 +36,10 @@ setuptools.setup( packages=setuptools.find_packages(), python_requires='>=3.6', install_requires=[ + "dataclasses==0.6", "base32-crockford==0.3.0", "django==2.2", - "dataclasses==0.6", + "django-extensions==2.1.6", ], entry_points={ 'console_scripts': [