ArchiveBox/archivebox/parsers/generic_txt.py

# Pin the package name explicitly; Python resolves this module's relative
# imports (`..index.schema`, `..util`) against `__package__`.
__package__ = 'archivebox.parsers'
# Short label for the input format this parser handles
# (presumably shown to users -- confirm where it is consumed).
__description__ = 'Plain Text'

import re

from typing import IO, Iterable
from datetime import datetime, timezone
from pathlib import Path

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    URL_REGEX
)


def _build_txt_link(url: str, source: str) -> Link:
    """Create an untitled, untagged Link for `url`, stamped with the current UTC time."""
    return Link(
        url=url,
        timestamp=str(datetime.now(timezone.utc).timestamp()),
        title=None,
        tags=None,
        sources=[source],
    )


@enforce_types
def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse links from a text file, ignoring other text.

    The file is rewound and read line by line. For every non-blank line:

    * if the whole line (surrounding whitespace removed) is a path that
      exists on the local filesystem, a Link for that path is yielded;
    * every match of URL_REGEX in the line is yielded as a Link, followed
      by any further URLs embedded inside that match (e.g. the original
      URL wrapped by a web.archive.org snapshot URL).

    Args:
        text_file: open text stream to scan; its `.name` is recorded as the
            source of every Link produced.
        **_kwargs: accepted and ignored, so all parsers can share one
            calling convention.

    Yields:
        Link objects with no title or tags, timestamped at creation time.
        Duplicates are possible when the same URL appears more than once.
    """

    text_file.seek(0)
    # iterate the stream lazily instead of loading every line with readlines()
    for line in text_file:
        candidate_path = line.strip()
        if not candidate_path:
            continue

        # If the line is a local file path that resolves, then we can archive it.
        # The trailing newline must be stripped first, otherwise the lookup
        # fails for every line except an unterminated final one.
        try:
            is_local_path = Path(candidate_path).exists()
        except (OSError, ValueError):
            # nvm, not a valid path (e.g. too long, or contains a NUL byte)
            is_local_path = False

        if is_local_path:
            yield _build_txt_link(candidate_path, text_file.name)

        # also look for anything that looks like a URL in the line
        for url in re.findall(URL_REGEX, line):
            yield _build_txt_link(htmldecode(url), text_file.name)

            # look inside the URL for any sub-urls, e.g. for archive.org links
            # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
            # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
            # Dropping the first character stops the outer URL from matching
            # again; searching `url` rather than the whole line avoids
            # re-yielding every other URL on the line once per match.
            for sub_url in re.findall(URL_REGEX, url[1:]):
                yield _build_txt_link(htmldecode(sub_url), text_file.name)

# Module-level parser definition: KEY, NAME and PARSER describe this parser
# so it does not have to be hardcoded in the package init
# (presumably KEY is the short identifier and NAME the display label --
# confirm against the code that collects these).
KEY = 'txt'
NAME = 'Generic TXT'
PARSER = parse_generic_txt_export
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`__package__ = 'archivebox.parsers'`
			`__description__ = 'Plain Text'`

			`import re`

			`from typing import IO, Iterable`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`from datetime import datetime, timezone`
accept local paths as valid link URLs when parsing 2020-07-13 15:22:58 +00:00			`from pathlib import Path`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`from ..index.schema import Link`
			`from ..util import (`
			`htmldecode,`
			`enforce_types,`
			`URL_REGEX`
			`)`

accept local paths as valid link URLs when parsing 2020-07-13 15:22:58 +00:00
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`@enforce_types`
make all parsers accept arbitrary meta kwargs 2020-08-18 12:27:47 +00:00			`def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:`
add command: --parser option 2021-03-20 16:38:00 +00:00			`"""Parse links from a text file, ignoring other text"""`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`text_file.seek(0)`
			`for line in text_file.readlines():`
accept local paths as valid link URLs when parsing 2020-07-13 15:22:58 +00:00			`if not line.strip():`
			`continue`

			`# if the line is a local file path that resolves, then we can archive it`
fix url is too long to be a path error 2020-08-18 12:23:57 +00:00			`try:`
			`if Path(line).exists():`
			`yield Link(`
			`url=line,`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`timestamp=str(datetime.now(timezone.utc).timestamp()),`
fix url is too long to be a path error 2020-08-18 12:23:57 +00:00			`title=None,`
			`tags=None,`
			`sources=[text_file.name],`
			`)`
			`except (OSError, PermissionError):`
			`# nvm, not a valid path...`
			`pass`
accept local paths as valid link URLs when parsing 2020-07-13 15:22:58 +00:00
			`# otherwise look for anything that looks like a URL in the line`
			`for url in re.findall(URL_REGEX, line):`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`yield Link(`
			`url=htmldecode(url),`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`timestamp=str(datetime.now(timezone.utc).timestamp()),`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`title=None,`
			`tags=None,`
			`sources=[text_file.name],`
			`)`
also parse and archive sub-urls in generic_txt input 2020-07-27 22:52:02 +00:00
			`# look inside the URL for any sub-urls, e.g. for archive.org links`
			`# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/`
			`# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/`
fix lgtm errors 2021-01-30 11:07:35 +00:00			`for sub_url in re.findall(URL_REGEX, line[1:]):`
also parse and archive sub-urls in generic_txt input 2020-07-27 22:52:02 +00:00			`yield Link(`
fix lgtm errors 2021-01-30 11:07:35 +00:00			`url=htmldecode(sub_url),`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`timestamp=str(datetime.now(timezone.utc).timestamp()),`
also parse and archive sub-urls in generic_txt input 2020-07-27 22:52:02 +00:00			`title=None,`
			`tags=None,`
			`sources=[text_file.name],`
			`)`
use KEY, NAME, and PARSER to define parsers instead of hardcoding in init 2021-03-31 05:05:49 +00:00
			`KEY = 'txt'`
			`NAME = 'Generic TXT'`
			`PARSER = parse_generic_txt_export`