mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
use KEY, NAME, and PARSER to define parsers instead of hardcoding in init
This commit is contained in:
parent
36f0646501
commit
8ce93ff787
13 changed files with 88 additions and 28 deletions
|
@ -31,39 +31,41 @@ from ..util import (
|
|||
from ..index.schema import Link
|
||||
from ..logging_util import TimedProgress, log_source_saved
|
||||
|
||||
from .pocket_html import parse_pocket_html_export
|
||||
from .pocket_api import parse_pocket_api_export
|
||||
from .pinboard_rss import parse_pinboard_rss_export
|
||||
from .wallabag_atom import parse_wallabag_atom_export
|
||||
from .shaarli_rss import parse_shaarli_rss_export
|
||||
from .medium_rss import parse_medium_rss_export
|
||||
from .netscape_html import parse_netscape_html_export
|
||||
from .generic_rss import parse_generic_rss_export
|
||||
from .generic_json import parse_generic_json_export
|
||||
from .generic_html import parse_generic_html_export
|
||||
from .generic_txt import parse_generic_txt_export
|
||||
from .url_list import parse_url_list
|
||||
from . import pocket_api
|
||||
from . import wallabag_atom
|
||||
from . import pocket_html
|
||||
from . import pinboard_rss
|
||||
from . import shaarli_rss
|
||||
from . import medium_rss
|
||||
|
||||
from . import netscape_html
|
||||
from . import generic_rss
|
||||
from . import generic_json
|
||||
from . import generic_html
|
||||
from . import generic_txt
|
||||
from . import url_list
|
||||
|
||||
|
||||
PARSERS = {
|
||||
# Specialized parsers
|
||||
'pocket-api': ('Pocket API', parse_pocket_api_export),
|
||||
'wallabag': ('Wallabag ATOM', parse_wallabag_atom_export),
|
||||
'pocket-html': ('Pocket HTML', parse_pocket_html_export),
|
||||
'pinboard-rss': ('Pinboard RSS', parse_pinboard_rss_export),
|
||||
'shaarli-rss': ('Shaarli RSS', parse_shaarli_rss_export),
|
||||
'medium-rss': ('Medium RSS', parse_medium_rss_export),
|
||||
|
||||
# General parsers
|
||||
'netscape-html': ('Netscape HTML', parse_netscape_html_export),
|
||||
'rss': ('Generic RSS', parse_generic_rss_export),
|
||||
'json': ('Generic JSON', parse_generic_json_export),
|
||||
'html': ('Generic HTML', parse_generic_html_export),
|
||||
pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
|
||||
wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
|
||||
pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
|
||||
pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
|
||||
shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER),
|
||||
medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER),
|
||||
|
||||
# Fallback parser
|
||||
'plain-text': ('Plain Text', parse_generic_txt_export),
|
||||
# General parsers
|
||||
netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
|
||||
generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
|
||||
generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
|
||||
generic_html.KEY: (generic_html.NAME, generic_html.PARSER),
|
||||
|
||||
# Catchall fallback parser
|
||||
generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER),
|
||||
|
||||
# Explicitly specified parsers
|
||||
'url-list': ('URL list', parse_url_list),
|
||||
url_list.KEY: (url_list.NAME, url_list.PARSER),
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -51,3 +51,8 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
|
|||
tags=None,
|
||||
sources=[html_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'html'
|
||||
NAME = 'Generic HTML'
|
||||
PARSER = parse_generic_html_export
|
||||
|
|
|
@ -63,3 +63,8 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=htmldecode(link.get('tags')) or '',
|
||||
sources=[json_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'json'
|
||||
NAME = 'Generic JSON'
|
||||
PARSER = parse_generic_json_export
|
||||
|
|
|
@ -47,3 +47,8 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=None,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'rss'
|
||||
NAME = 'Generic RSS'
|
||||
PARSER = parse_generic_rss_export
|
||||
|
|
|
@ -59,3 +59,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=None,
|
||||
sources=[text_file.name],
|
||||
)
|
||||
|
||||
KEY = 'txt'
|
||||
NAME = 'Generic TXT'
|
||||
PARSER = parse_generic_txt_export
|
||||
|
|
|
@ -33,3 +33,8 @@ def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=None,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'medium_rss'
|
||||
NAME = 'Medium RSS'
|
||||
PARSER = parse_medium_rss_export
|
||||
|
|
|
@ -37,3 +37,7 @@ def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
sources=[html_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'netscape_html'
|
||||
NAME = 'Netscape HTML'
|
||||
PARSER = parse_netscape_html_export
|
||||
|
|
|
@ -45,3 +45,8 @@ def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=htmldecode(tags) or None,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'pinboard_rss'
|
||||
NAME = 'Pinboard RSS'
|
||||
PARSER = parse_pinboard_rss_export
|
||||
|
|
|
@ -111,3 +111,8 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
yield link_from_article(article, sources=[line])
|
||||
|
||||
write_since(username, api.last_since)
|
||||
|
||||
|
||||
KEY = 'pocket_api'
|
||||
NAME = 'Pocket API'
|
||||
PARSER = parse_pocket_api_export
|
||||
|
|
|
@ -36,3 +36,8 @@ def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=tags or '',
|
||||
sources=[html_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'pocket_html'
|
||||
NAME = 'Pocket HTML'
|
||||
PARSER = parse_pocket_html_export
|
||||
|
|
|
@ -48,3 +48,8 @@ def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=None,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'shaarli_rss'
|
||||
NAME = 'Shaarli RSS'
|
||||
PARSER = parse_shaarli_rss_export
|
||||
|
|
|
@ -17,7 +17,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
text_file.seek(0)
|
||||
for line in text_file.readlines():
|
||||
url = line.strip()
|
||||
if len(url) == 0:
|
||||
if not url:
|
||||
continue
|
||||
|
||||
yield Link(
|
||||
|
@ -27,3 +27,8 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=None,
|
||||
sources=[text_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'url_list'
|
||||
NAME = 'URL List'
|
||||
PARSER = parse_url_list
|
||||
|
|
|
@ -55,3 +55,8 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
tags=tags or '',
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
||||
|
||||
KEY = 'wallabag_atom'
|
||||
NAME = 'Wallabag Atom'
|
||||
PARSER = parse_wallabag_atom_export
|
||||
|
|
Loading…
Reference in a new issue