ArchiveBox/archivebox/parsers/generic_json.py

__package__ = 'archivebox.parsers'

import json

from typing import IO, Iterable
from datetime import datetime, timezone

from ..index.schema import Link
from archivebox.misc.util import (
    htmldecode,
    enforce_types,
)

# This gets used by generic_jsonl, too
def jsonObjectToLink(link: dict, source: str):
    """Build a ``Link`` from one decoded JSON bookmark object.

    Fields are looked up under several alternative key names; in each group
    the first key holding a truthy value wins:

    * URL: ``href``, ``url``, ``URL``.
    * Timestamp: ``timestamp`` (a number, or a numeric string, divided by
      10000000), then ``time``, ``created_at``, ``created``, ``date``,
      ``bookmarked``, ``saved`` (strings in ``%Y-%m-%dT%H:%M:%S%z`` form).
      If none is present the current UTC time is used.
    * Title: ``title``, ``description``, ``name``.
    * Tags: ``tags`` as a list (joined with commas) or a string (spaces are
      turned into commas when the string contains no comma).

    Args:
        link: One entry of the parsed JSON document (a dict-like object).
        source: Identifier of where the entry came from; stored as the only
            element of the resulting link's ``sources``.

    Returns:
        The ``Link`` built from the entry.

    Raises:
        Exception: If the entry contains no URL.
        ValueError: If a date string does not match the expected format, or
            a string ``timestamp`` is not numeric.
    """
    date_format = '%Y-%m-%dT%H:%M:%S%z'
    # Checked in this order; the first key with a truthy value is used.
    date_keys = ('time', 'created_at', 'created', 'date', 'bookmarked', 'saved')

    # example line
    # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
    # Parse URL
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url:
        raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

    # Parse the timestamp
    raw_timestamp = link.get('timestamp')
    if raw_timestamp:
        # chrome/ff histories use a very precise timestamp
        if isinstance(raw_timestamp, str):
            # Some exports quote the number; only strings are coerced so that
            # numeric inputs keep exactly the same arithmetic as before.
            raw_timestamp = float(raw_timestamp)
        ts_str = str(raw_timestamp / 10000000)
    else:
        for key in date_keys:
            raw_date = link.get(key)
            if not raw_date:
                continue
            if key == 'time':
                # Only the text before the first comma is parsed for `time`.
                raw_date = raw_date.split(',', 1)[0]
            ts_str = str(datetime.strptime(raw_date, date_format).timestamp())
            break
        else:
            # No date information at all: fall back to "now" (UTC).
            ts_str = str(datetime.now(timezone.utc).timestamp())

    # Parse the title
    title = None
    if link.get('title'):
        title = link['title'].strip()
    elif link.get('description'):
        title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name'):
        title = link['name'].strip()

    # if we have a list, join it with commas
    tags = link.get('tags')
    if isinstance(tags, list):
        tags = ','.join(tags)
    elif isinstance(tags, str) and ',' not in tags:
        # if there's no comma, assume it was space-separated
        tags = tags.replace(' ', ',')

    return Link(
        url=htmldecode(url),
        timestamp=ts_str,
        title=htmldecode(title) or None,
        tags=htmldecode(tags),
        sources=[source],
    )

@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)

    The file must contain a single JSON array; every truthy element of that
    array is converted with ``jsonObjectToLink`` and yielded, using
    ``json_file.name`` as the link's source.

    This is a generator function: the file is read and validated when the
    first item is requested, not when the function is called.

    Args:
        json_file: Readable, seekable text file holding the JSON document.
        **_kwargs: Accepted and ignored.

    Yields:
        One ``Link`` per non-empty entry, in document order.

    Raises:
        Exception: If the top-level JSON value is not a list.
    """

    # Always read from the beginning, whatever the current file position is.
    json_file.seek(0)

    links = json.load(json_file)
    if not isinstance(links, list):
        raise Exception('JSON parser expects list of objects, maybe this is JSONL?')

    for link in links:
        # Skip empty/null entries instead of failing on them.
        if link:
            yield jsonObjectToLink(link, json_file.name)

# Module-level descriptor for this parser: an identifier string, a
# human-readable name, and the function that does the parsing.
# NOTE(review): presumably read by the parsers package to register this
# parser — confirm against the package __init__.
KEY = 'json'
NAME = 'Generic JSON'
PARSER = parse_generic_json_export
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`__package__ = 'archivebox.parsers'`

			`import json`

			`from typing import IO, Iterable`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`from datetime import datetime, timezone`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`from ..index.schema import Link`
move util.py into misc folder 2024-10-01 00:25:15 +00:00			`from archivebox.misc.util import (`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`htmldecode,`
			`enforce_types,`
			`)`

Add generic_jsonl parser Resolves #1369 2024-03-01 02:15:06 +00:00			`# This gets used by generic_jsonl, too`
			`def jsonObjectToLink(link: str, source: str):`
			`json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')`

			`# example line`
			`# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]`
			`# Parse URL`
			`url = link.get('href') or link.get('url') or link.get('URL')`
			`if not url:`
			`raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')`

			`# Parse the timestamp`
			`ts_str = str(datetime.now(timezone.utc).timestamp())`
			`if link.get('timestamp'):`
			`# chrome/ff histories use a very precise timestamp`
			`ts_str = str(link['timestamp'] / 10000000)`
			`elif link.get('time'):`
			`ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())`
			`elif link.get('created_at'):`
			`ts_str = str(json_date(link['created_at']).timestamp())`
			`elif link.get('created'):`
			`ts_str = str(json_date(link['created']).timestamp())`
			`elif link.get('date'):`
			`ts_str = str(json_date(link['date']).timestamp())`
			`elif link.get('bookmarked'):`
			`ts_str = str(json_date(link['bookmarked']).timestamp())`
			`elif link.get('saved'):`
			`ts_str = str(json_date(link['saved']).timestamp())`

			`# Parse the title`
			`title = None`
			`if link.get('title'):`
			`title = link['title'].strip()`
			`elif link.get('description'):`
			`title = link['description'].replace(' — Readability', '').strip()`
			`elif link.get('name'):`
			`title = link['name'].strip()`

			`# if we have a list, join it with commas`
			`tags = link.get('tags')`
			`if type(tags) == list:`
			`tags = ','.join(tags)`
			`elif type(tags) == str:`
			`# if there's no comma, assume it was space-separated`
			`if ',' not in tags:`
			`tags = tags.replace(' ', ',')`

			`return Link(`
			`url=htmldecode(url),`
			`timestamp=ts_str,`
			`title=htmldecode(title) or None,`
			`tags=htmldecode(tags),`
			`sources=[source],`
			`)`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`@enforce_types`
make all parsers accept arbitrary meta kwargs 2020-08-18 12:27:47 +00:00			`def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""`

			`json_file.seek(0)`
only start parsing json after the first open brace 2023-09-04 04:40:12 +00:00
fix lint errors 2024-04-26 04:36:11 +00:00			`links = json.load(json_file)`
			`if type(links) != list:`
			`raise Exception('JSON parser expects list of objects, maybe this is JSONL?')`

move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`for link in links:`
			`if link:`
fix lint errors 2024-04-26 04:36:11 +00:00			`yield jsonObjectToLink(link, json_file.name)`
use KEY, NAME, and PARSER to define parsers instead of hardcoding in init 2021-03-31 05:05:49 +00:00
			`KEY = 'json'`
			`NAME = 'Generic JSON'`
			`PARSER = parse_generic_json_export`