Use per-module KEY, NAME, and PARSER constants to define parsers instead of hardcoding them in init

This commit is contained in:
Nick Sweeting 2021-03-31 01:05:49 -04:00
parent 36f0646501
commit 8ce93ff787
13 changed files with 88 additions and 28 deletions

View file

@ -31,39 +31,41 @@ from ..util import (
from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved
from .pocket_html import parse_pocket_html_export
from .pocket_api import parse_pocket_api_export
from .pinboard_rss import parse_pinboard_rss_export
from .wallabag_atom import parse_wallabag_atom_export
from .shaarli_rss import parse_shaarli_rss_export
from .medium_rss import parse_medium_rss_export
from .netscape_html import parse_netscape_html_export
from .generic_rss import parse_generic_rss_export
from .generic_json import parse_generic_json_export
from .generic_html import parse_generic_html_export
from .generic_txt import parse_generic_txt_export
from .url_list import parse_url_list
from . import pocket_api
from . import wallabag_atom
from . import pocket_html
from . import pinboard_rss
from . import shaarli_rss
from . import medium_rss
from . import netscape_html
from . import generic_rss
from . import generic_json
from . import generic_html
from . import generic_txt
from . import url_list
# Registry of link parsers: every entry maps a parser key to a
# (display name, parser callable) tuple.
# NOTE(review): this view contains two styles of entry -- literal keys paired
# with the imported parse_* functions, and entries built from each parser
# module's KEY / NAME / PARSER attributes. They look like the before/after
# sides of the same change; confirm which set is actually live.
# NOTE(review): the section comments below (specialized -> general -> fallback)
# suggest that entry order matters to whatever iterates this dict -- verify
# against the caller before reordering anything.
PARSERS = {
# Specialized parsers
'pocket-api': ('Pocket API', parse_pocket_api_export),
'wallabag': ('Wallabag ATOM', parse_wallabag_atom_export),
'pocket-html': ('Pocket HTML', parse_pocket_html_export),
'pinboard-rss': ('Pinboard RSS', parse_pinboard_rss_export),
'shaarli-rss': ('Shaarli RSS', parse_shaarli_rss_export),
'medium-rss': ('Medium RSS', parse_medium_rss_export),
# General parsers
'netscape-html': ('Netscape HTML', parse_netscape_html_export),
'rss': ('Generic RSS', parse_generic_rss_export),
'json': ('Generic JSON', parse_generic_json_export),
'html': ('Generic HTML', parse_generic_html_export),
pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER),
medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER),
# Fallback parser
'plain-text': ('Plain Text', parse_generic_txt_export),
# General parsers
netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
generic_html.KEY: (generic_html.NAME, generic_html.PARSER),
# Catchall fallback parser
generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER),
# Explicitly specified parsers
'url-list': ('URL list', parse_url_list),
url_list.KEY: (url_list.NAME, url_list.PARSER),
}

View file

@ -51,3 +51,8 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
tags=None,
sources=[html_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_generic_html_export  # callable stored in the registry entry
NAME = 'Generic HTML'               # display name stored alongside PARSER
KEY = 'html'                        # dict key this parser is registered under

View file

@ -63,3 +63,8 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_generic_json_export  # callable stored in the registry entry
NAME = 'Generic JSON'               # display name stored alongside PARSER
KEY = 'json'                        # dict key this parser is registered under

View file

@ -47,3 +47,8 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=None,
sources=[rss_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_generic_rss_export  # callable stored in the registry entry
NAME = 'Generic RSS'               # display name stored alongside PARSER
KEY = 'rss'                        # dict key this parser is registered under

View file

@ -59,3 +59,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=None,
sources=[text_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_generic_txt_export  # callable stored in the registry entry
NAME = 'Generic TXT'               # display name stored alongside PARSER
KEY = 'txt'                        # dict key this parser is registered under

View file

@ -33,3 +33,8 @@ def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=None,
sources=[rss_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_medium_rss_export  # callable stored in the registry entry
NAME = 'Medium RSS'               # display name stored alongside PARSER
KEY = 'medium_rss'                # dict key this parser is registered under

View file

@ -37,3 +37,7 @@ def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
sources=[html_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_netscape_html_export  # callable stored in the registry entry
NAME = 'Netscape HTML'               # display name stored alongside PARSER
KEY = 'netscape_html'                # dict key this parser is registered under

View file

@ -45,3 +45,8 @@ def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=htmldecode(tags) or None,
sources=[rss_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_pinboard_rss_export  # callable stored in the registry entry
NAME = 'Pinboard RSS'               # display name stored alongside PARSER
KEY = 'pinboard_rss'                # dict key this parser is registered under

View file

@ -111,3 +111,8 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
yield link_from_article(article, sources=[line])
write_since(username, api.last_since)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_pocket_api_export  # callable stored in the registry entry
NAME = 'Pocket API'               # display name stored alongside PARSER
KEY = 'pocket_api'                # dict key this parser is registered under

View file

@ -36,3 +36,8 @@ def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=tags or '',
sources=[html_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_pocket_html_export  # callable stored in the registry entry
NAME = 'Pocket HTML'               # display name stored alongside PARSER
KEY = 'pocket_html'                # dict key this parser is registered under

View file

@ -48,3 +48,8 @@ def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=None,
sources=[rss_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_shaarli_rss_export  # callable stored in the registry entry
NAME = 'Shaarli RSS'               # display name stored alongside PARSER
KEY = 'shaarli_rss'                # dict key this parser is registered under

View file

@ -17,7 +17,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
text_file.seek(0)
for line in text_file.readlines():
url = line.strip()
if len(url) == 0:
if not url:
continue
yield Link(
@ -27,3 +27,8 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=None,
sources=[text_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_url_list  # callable stored in the registry entry
NAME = 'URL List'        # display name stored alongside PARSER
KEY = 'url_list'         # dict key this parser is registered under

View file

@ -55,3 +55,8 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
tags=tags or '',
sources=[rss_file.name],
)
# Registration metadata for this parser module. The parsers registry builds
# its entry as KEY -> (NAME, PARSER) from these three module attributes.
PARSER = parse_wallabag_atom_export  # callable stored in the registry entry
NAME = 'Wallabag Atom'               # display name stored alongside PARSER
KEY = 'wallabag_atom'                # dict key this parser is registered under