ArchiveBox/archivebox/parsers/generic_json.py

__package__ = 'archivebox.parsers'

import json

from typing import IO, Iterable
from datetime import datetime, timezone

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)

# This gets used by generic_jsonl, too
def jsonObjectToLink(link: str, source: str):
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    # example line
    # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
    # Parse URL
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url:
        raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

    # Parse the timestamp
    ts_str = str(datetime.now(timezone.utc).timestamp())
    if link.get('timestamp'):
        # chrome/ff histories use a very precise timestamp
        ts_str = str(link['timestamp'] / 10000000)
    elif link.get('time'):
        ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
    elif link.get('created_at'):
        ts_str = str(json_date(link['created_at']).timestamp())
    elif link.get('created'):
        ts_str = str(json_date(link['created']).timestamp())
    elif link.get('date'):
        ts_str = str(json_date(link['date']).timestamp())
    elif link.get('bookmarked'):
        ts_str = str(json_date(link['bookmarked']).timestamp())
    elif link.get('saved'):
        ts_str = str(json_date(link['saved']).timestamp())

    # Parse the title
    title = None
    if link.get('title'):
        title = link['title'].strip()
    elif link.get('description'):
        title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name'):
        title = link['name'].strip()

    # if we have a list, join it with commas
    tags = link.get('tags')
    if type(tags) == list:
        tags = ','.join(tags)
    elif type(tags) == str:
        # if there's no comma, assume it was space-separated
        if ',' not in tags:
            tags = tags.replace(' ', ',')

    return Link(
        url=htmldecode(url),
        timestamp=ts_str,
        title=htmldecode(title) or None,
        tags=htmldecode(tags),
        sources=[source],
    )

@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)

    try:
        links = json.load(json_file)
        if type(links) != list:
            raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
    except json.decoder.JSONDecodeError:
        # sometimes the first line is a comment or other junk, so try without
        json_file.seek(0)
        first_line = json_file.readline()
        #print('      > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
        links = json.load(json_file)
        # we may fail again, which means we really don't know what to do

    for link in links:
        if link:
            yield jsonObjectToLink(link,json_file.name)

KEY = 'json'
NAME = 'Generic JSON'
PARSER = parse_generic_json_export