ArchiveBox/archivebox/parsers/__init__.py

"""
Everything related to parsing links from input sources.
For a list of supported services, see the README.md.
For examples of supported import formats see tests/.
"""
__package__ = 'archivebox.parsers'

import re

from io import StringIO
from typing import IO, Tuple, List, Optional
from datetime import datetime
from pathlib import Path

from ..system import atomic_write
from ..config import (
    ANSI,
    OUTPUT_DIR,
    SOURCES_DIR_NAME,
    TIMEOUT,
)
from ..util import (
    basename,
    htmldecode,
    download_url,
    enforce_types,
    URL_REGEX,
)

from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved

from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
from .wallabag_atom import parse_wallabag_atom_export
from .shaarli_rss import parse_shaarli_rss_export
from .medium_rss import parse_medium_rss_export
from .netscape_html import parse_netscape_html_export
from .generic_rss import parse_generic_rss_export
from .generic_json import parse_generic_json_export
from .generic_html import parse_generic_html_export
from .generic_txt import parse_generic_txt_export

PARSERS = (
    # Specialized parsers
    ('Wallabag ATOM', parse_wallabag_atom_export),
    ('Pocket HTML', parse_pocket_html_export),
    ('Pinboard RSS', parse_pinboard_rss_export),
    ('Shaarli RSS', parse_shaarli_rss_export),
    ('Medium RSS', parse_medium_rss_export),

    # General parsers
    ('Netscape HTML', parse_netscape_html_export),
    ('Generic RSS', parse_generic_rss_export),
    ('Generic JSON', parse_generic_json_export),
    ('Generic HTML', parse_generic_html_export),

    # Fallback parser
    ('Plain Text', parse_generic_txt_export),
)
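
# Each parser is a function that takes an open text file handle (plus an
# optional root_url) and yields Link objects. A new import format could be
# plugged in by adding an entry above -- a hypothetical sketch, assuming a
# .my_format submodule exposing parse_my_format_export:
#
#   from .my_format import parse_my_format_export
#   PARSERS = (('My Format', parse_my_format_export), *PARSERS)
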

@enforce_types
def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
    """
    parse a list of URLs without touching the filesystem
    """
    check_url_parsing_invariants()

    timer = TimedProgress(TIMEOUT * 4)
    # NOTE: writelines() does not add separators, so callers should pass
    # newline-terminated strings (or uncomment the mapping below):
    #urls = list(map(lambda x: x + "\n", urls))
    file = StringIO()
    file.writelines(urls)
    file.name = "io_string"
    links, parser = run_parser_functions(file, timer, root_url=root_url)
    timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
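
# Usage sketch (illustrative only, kept commented so nothing runs on import):
#
#   links, parser_name = parse_links_memory([
#       'https://example.com/some/page\n',
#       'https://example.com/feed.rss\n',
#   ])
#   print(f'parsed {len(links)} links using the {parser_name} parser')
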

@enforce_types
def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """
    check_url_parsing_invariants()

    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        links, parser = run_parser_functions(file, timer, root_url=root_url)
    timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
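
# Usage sketch (illustrative only; the source file path here is hypothetical):
#
#   links, parser_name = parse_links('output/sources/1600000000-bookmarks.html.txt')
#   for link in links:
#       print(link.url)
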

def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
    most_links: List[Link] = []
    best_parser_name = None

    for parser_name, parser_func in PARSERS:
        try:
            parsed_links = list(parser_func(to_parse, root_url=root_url))
            if not parsed_links:
                raise Exception('no links found')

            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
            if len(parsed_links) > len(most_links):
                most_links = parsed_links
                best_parser_name = parser_name

        except Exception as err:   # noqa
            # Every parser is tried one by one down the list, and the one that
            # yields the most links is used. To see why a certain parser was
            # not used due to error or format incompatibility, uncomment:

            # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
            # raise
            pass

    timer.end()
    return most_links, best_parser_name
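
# Behavior sketch (illustrative only): a bare list of URLs should fall through
# the specialized parsers and get picked up by the Plain Text fallback.
#
#   from io import StringIO
#   feed = StringIO('https://example.com/a\nhttps://example.com/b\n')
#   feed.name = 'io_string'
#   links, parser_name = run_parser_functions(feed, TimedProgress(TIMEOUT))
#   # parser_name == 'Plain Text'
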

@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))
    atomic_write(source_path, raw_text)
    log_source_saved(source_file=source_path)
    return source_path
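
# Usage sketch (illustrative only): persist piped-in text to sources/ first,
# so the import stays reproducible, then parse the saved copy:
#
#   source_path = save_text_as_source('https://example.com\n')
#   links, parser_name = parse_links(source_path)
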

@enforce_types
def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=OUTPUT_DIR) -> str:
    """download a given url's content into output/sources/<timestamp>-<basename>.txt"""
    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(basename=basename(path), ts=ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        # Source is a URL that needs to be downloaded
        print(f'    > Downloading {path} contents')
        timer = TimedProgress(timeout, prefix='      ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            raw_source_text = htmldecode(raw_source_text)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                ANSI['red'],
                path,
                ANSI['reset'],
            ))
            print('    ', e)
            raise SystemExit(1)
    else:
        # Source is a path to a local file on the filesystem
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(source_path, raw_source_text)
    log_source_saved(source_file=source_path)
    return source_path
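
# Usage sketch (illustrative only): remote URLs are downloaded and local paths
# are copied into sources/ before parsing, so both go through the same flow:
#
#   source_path = save_file_as_source('https://example.com/bookmarks.html')
#   links, parser_name = parse_links(source_path)
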

def check_url_parsing_invariants() -> None:
    """Check that plain text regex URL parsing works as expected"""

    # this is a last line of defense to make sure the URL_REGEX isn't
    # misbehaving, as the consequences could be disastrous and lead to many
    # incorrect/badly parsed links being added to the archive
    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com
    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))

    # 12 == the number of valid URLs above (example1-example12);
    # example13-example15 are intentionally malformed and must not match
    assert len(re.findall(URL_REGEX, test_urls)) == 12