ArchiveBox/archivebox/index/json.py

__package__ = 'archivebox.index'

import os
import sys
import json as pyjson

from datetime import datetime
from typing import List, Optional, Iterator, Any

from .schema import Link, ArchiveResult
from ..system import atomic_write
from ..util import enforce_types
from ..config import (
    VERSION,
    OUTPUT_DIR,
    FOOTER_INFO,
    GIT_SHA,
    DEPENDENCIES,
    JSON_INDEX_FILENAME,
    ARCHIVE_DIR_NAME,
)


# Static header fields for the main JSON index. write_json_main_index()
# spreads this dict into the document it writes, alongside the per-run
# fields ('num_links', 'updated', 'last_run_cmd', 'links').
# FOOTER_INFO, VERSION, GIT_SHA and DEPENDENCIES are imported from ..config.
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    # project/build metadata nested under its own key
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'source': 'https://github.com/pirate/ArchiveBox',
        'issues': 'https://github.com/pirate/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}


### Main Links Index

@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """parse an archive index json file and yield the links it contains

    out_dir: directory expected to contain the JSON_INDEX_FILENAME file.

    Yields one Link (built with Link.from_json) per entry under the
    top-level 'links' key. Yields nothing if the index file does not exist.
    Errors from opening or decoding the file, or a missing 'links' key,
    are not caught here and propagate to the consumer.
    """

    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(index_path):
        return

    # pyjson.load() consumes the whole file, so finish reading and close the
    # handle *before* yielding; otherwise the file stays open for as long as
    # the consumer takes to iterate (or forever if it abandons the generator).
    with open(index_path, 'r', encoding='utf-8') as f:
        links = pyjson.load(f)['links']

    for link_json in links:
        yield Link.from_json(link_json)

@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """dump the main index header plus the given links into out_dir's json index"""

    assert isinstance(links, List), 'Links must be a list, not a generator.'

    # Only the first link is spot-checked for the expected field types.
    if links:
        first = links[0]
        assert isinstance(first.history, dict)
        assert isinstance(first.sources, list)

        if first.history.get('title'):
            assert isinstance(first.history['title'][0], ArchiveResult)

        if first.sources:
            assert isinstance(first.sources[0], str)

    # Start from a copy of the static header, then add the per-run fields.
    payload = dict(MAIN_INDEX_HEADER)
    payload.update({
        'num_links': len(links),
        'updated': datetime.now(),
        'last_run_cmd': sys.argv,
        'links': links,
    })

    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    atomic_write(payload, index_path)


### Link Details Index

@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """dump the link's extended dict form into the json index file of a folder"""

    # a missing/empty out_dir means "use the link's own directory"
    target_dir = out_dir if out_dir else link.link_dir
    index_path = os.path.join(target_dir, JSON_INDEX_FILENAME)

    link_info = link._asdict(extended=True)
    atomic_write(link_info, index_path)


@enforce_types
def parse_json_link_details(out_dir: str) -> Optional[Link]:
    """read the json link index found in out_dir, or None if absent/undecodable"""
    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(index_path):
        return None

    with open(index_path, 'r', encoding='utf-8') as index_file:
        try:
            return Link.from_json(pyjson.load(index_file))
        except pyjson.JSONDecodeError:
            # malformed json is deliberately ignored; caller just gets None
            return None


@enforce_types
def parse_json_links_details(out_dir: str) -> Iterator[Link]:
    """read through all the archive data folders and yield the parsed links

    out_dir: directory containing the ARCHIVE_DIR_NAME folder to scan.

    Each direct subdirectory (symlinks followed) that contains a
    JSON_INDEX_FILENAME file is passed to parse_json_link_details();
    folders for which that returns a falsy value are skipped.
    os.scandir() errors (e.g. the archive folder is missing) propagate.
    """

    archive_dir = os.path.join(out_dir, ARCHIVE_DIR_NAME)
    for entry in os.scandir(archive_dir):
        if not entry.is_dir(follow_symlinks=True):
            continue

        # Look for the configured index filename (not a hard-coded
        # 'index.json') so this check agrees with parse_json_link_details().
        if not os.path.exists(os.path.join(entry.path, JSON_INDEX_FILENAME)):
            continue

        link = parse_json_link_details(entry.path)
        if link:
            yield link


### Helpers

class ExtendedEncoder(pyjson.JSONEncoder):
    """
    JSON encoder with fallbacks for values the stock encoder rejects:
    objects exposing _asdict(), bytes, datetimes, exceptions and dict views
    """

    def default(self, obj):
        # Checks run in priority order; the first match wins.
        if hasattr(obj, '_asdict'):
            return obj._asdict()

        if isinstance(obj, bytes):
            return obj.decode()

        if isinstance(obj, datetime):
            return obj.isoformat()

        type_name = obj.__class__.__name__

        if isinstance(obj, Exception):
            return f'{type_name}: {obj}'

        # dict views are recognised by class name
        if type_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        # anything else: let the base class raise its usual TypeError
        return super().default(obj)


@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
    """serialize obj to a json string

    obj:       the value to serialize
    indent:    indentation passed through to json.dumps (None for compact output)
    sort_keys: whether object keys are emitted in sorted order
    cls:       the JSONEncoder subclass to use, ExtendedEncoder by default
    """
    # Honour the caller-supplied encoder; previously this argument was
    # accepted but ExtendedEncoder was always used regardless.
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`__package__ = 'archivebox.index'`
add package headers 2019-04-17 22:00:54 -04:00
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`import os`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`import sys`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`import json as pyjson`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
			`from datetime import datetime`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`from typing import List, Optional, Iterator, Any`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`from .schema import Link, ArchiveResult`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`from ..system import atomic_write`
			`from ..util import enforce_types`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`from ..config import (`
			`VERSION,`
			`OUTPUT_DIR,`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`FOOTER_INFO,`
			`GIT_SHA,`
			`DEPENDENCIES,`
			`JSON_INDEX_FILENAME,`
add functions to parse link details jsons and list+apply migrations 2019-04-24 04:07:46 -04:00			`ARCHIVE_DIR_NAME,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`)`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`MAIN_INDEX_HEADER = {`
			`'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`'schema': 'archivebox.index.json',`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`'copyright_info': FOOTER_INFO,`
			`'meta': {`
			`'project': 'ArchiveBox',`
			`'version': VERSION,`
			`'git_sha': GIT_SHA,`
			`'website': 'https://ArchiveBox.io',`
			`'docs': 'https://github.com/pirate/ArchiveBox/wiki',`
			`'source': 'https://github.com/pirate/ArchiveBox',`
			`'issues': 'https://github.com/pirate/ArchiveBox/issues',`
			`'dependencies': DEPENDENCIES,`
			`},`
			`}`

better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
			`### Main Links Index`

			`@enforce_types`
			`def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`"""parse an archive index json file and return the list of links"""`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`if os.path.exists(index_path):`
			`with open(index_path, 'r', encoding='utf-8') as f:`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`links = pyjson.load(f)['links']`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`for link_json in links:`
			`yield Link.from_json(link_json)`

			`return ()`

			`@enforce_types`
			`def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:`
			`"""write the json link index to a given path"""`

			`assert isinstance(links, List), 'Links must be a list, not a generator.'`
			`assert not links or isinstance(links[0].history, dict)`
			`assert not links or isinstance(links[0].sources, list)`

			`if links and links[0].history.get('title'):`
			`assert isinstance(links[0].history['title'][0], ArchiveResult)`

			`if links and links[0].sources:`
			`assert isinstance(links[0].sources[0], str)`

add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`main_index_json = {`
			`**MAIN_INDEX_HEADER,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'num_links': len(links),`
			`'updated': datetime.now(),`
add archivebox info command to scan data dir 2019-04-22 14:34:30 -04:00			`'last_run_cmd': sys.argv,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'links': links,`
			`}`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00

			`### Link Details Index`

			`@enforce_types`
			`def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:`
			`"""write a json file with some info about the link"""`

			`out_dir = out_dir or link.link_dir`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`path = os.path.join(out_dir, JSON_INDEX_FILENAME)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
			`atomic_write(link._asdict(extended=True), path)`


			`@enforce_types`
			`def parse_json_link_details(out_dir: str) -> Optional[Link]:`
			`"""load the json link index from a given directory"""`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`if os.path.exists(existing_index):`
			`with open(existing_index, 'r', encoding='utf-8') as f:`
ignore json parsing errors when loading link jsons 2019-04-24 11:38:13 -04:00			`try:`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`link_json = pyjson.load(f)`
ignore json parsing errors when loading link jsons 2019-04-24 11:38:13 -04:00			`return Link.from_json(link_json)`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`except pyjson.JSONDecodeError:`
ignore json parsing errors when loading link jsons 2019-04-24 11:38:13 -04:00			`pass`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`return None`
add functions to parse link details jsons and list+apply migrations 2019-04-24 04:07:46 -04:00
split up utils into separate files 2019-04-30 23:13:04 -04:00
add functions to parse link details jsons and list+apply migrations 2019-04-24 04:07:46 -04:00			`@enforce_types`
			`def parse_json_links_details(out_dir: str) -> Iterator[Link]:`
			`"""read through all the archive data folders and return the parsed links"""`

			`for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):`
			`if entry.is_dir(follow_symlinks=True):`
			`if os.path.exists(os.path.join(entry.path, 'index.json')):`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`link = parse_json_link_details(entry.path)`
			`if link:`
			`yield link`
split up utils into separate files 2019-04-30 23:13:04 -04:00


			`### Helpers`

			`class ExtendedEncoder(pyjson.JSONEncoder):`
			`"""`
			`Extended json serializer that supports serializing several model`
			`fields and objects`
			`"""`

			`def default(self, obj):`
			`cls_name = obj.__class__.__name__`

			`if hasattr(obj, '_asdict'):`
			`return obj._asdict()`

			`elif isinstance(obj, bytes):`
			`return obj.decode()`

			`elif isinstance(obj, datetime):`
			`return obj.isoformat()`

			`elif isinstance(obj, Exception):`
			`return '{}: {}'.format(obj.__class__.__name__, obj)`

			`elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):`
			`return tuple(obj)`

			`return pyjson.JSONEncoder.default(self, obj)`


			`@enforce_types`
			`def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:`
			`return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)`