ArchiveBox/archivebox/parsers/generic_txt.py

# Package this module belongs to; Python consults __package__ when resolving
# the relative import (`from ..index.schema import Link`) further down.
__package__ = 'archivebox.parsers'
# Human-readable description of the input format handled by this module.
# NOTE(review): where this value is consumed is not visible here -- confirm.
__description__ = 'Plain Text'

from typing import IO, Iterable
from datetime import datetime, timezone

from ..index.schema import Link
from archivebox.misc.util import (
    htmldecode,
    enforce_types,
    find_all_urls,
)


@enforce_types
def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse links from a text file, ignoring other text.

    Rewinds ``text_file`` to its start, then scans it one line at a time and
    yields a ``Link`` for every URL that ``find_all_urls`` reports on a line.
    Lines that are empty or whitespace-only are skipped.

    Args:
        text_file: Seekable text stream to scan. Its ``name`` attribute is
            recorded as the source of every yielded link.
        **_kwargs: Accepted for call-site compatibility and ignored.

    Yields:
        One ``Link`` per URL found, with the URL passed through
        ``htmldecode``, the current UTC epoch time (as a string) as its
        timestamp, and ``title``/``tags`` left as ``None``.
    """

    text_file.seek(0)

    # Iterate the stream directly instead of calling readlines(), so the
    # whole file is never held in memory as a list of lines.
    for line in text_file:
        if not line.strip():
            continue

        # NOTE: special handling for `file://` lines that resolve to an
        # existing local path used to live here but is disabled; every line
        # is only scanned for URLs with find_all_urls().
        for url in find_all_urls(line):
            yield Link(
                url=htmldecode(url),
                # taken at yield time, so each link gets its own timestamp
                timestamp=str(datetime.now(timezone.utc).timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )


# Module-level definition of this parser: a short key, a display name, and
# the callable that does the parsing.
# NOTE(review): presumably collected by the parsers package to register this
# parser instead of hardcoding it -- confirm against the package __init__.
KEY = 'txt'
NAME = 'Generic TXT'
PARSER = parse_generic_txt_export
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`__package__ = 'archivebox.parsers'`
			`__description__ = 'Plain Text'`

			`from typing import IO, Iterable`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`from datetime import datetime, timezone`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`from ..index.schema import Link`
move util.py into misc folder 2024-10-01 00:25:15 +00:00			`from archivebox.misc.util import (`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`htmldecode,`
			`enforce_types,`
replace uses of URL_REGEX with find_all_urls to handle markdown better 2024-04-25 00:45:45 +00:00			`find_all_urls,`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`)`

accept local paths as valid link URLs when parsing 2020-07-13 15:22:58 +00:00
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`@enforce_types`
make all parsers accept arbitrary meta kwargs 2020-08-18 12:27:47 +00:00			`def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:`
add command: --parser option 2021-03-20 16:38:00 +00:00			`"""Parse links from a text file, ignoring other text"""`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00
			`text_file.seek(0)`
			`for line in text_file.readlines():`
accept local paths as valid link URLs when parsing 2020-07-13 15:22:58 +00:00			`if not line.strip():`
			`continue`

switch .is_dir and .exists for os.access to avoid PermissionError on startup 2024-10-08 10:02:34 +00:00			`# # if the line is a local file path that resolves, then we can archive it`
			`# if line.startswith('file://'):`
			`# try:`
			`# if Path(line).exists():`
			`# yield Link(`
			`# url=line,`
			`# timestamp=str(datetime.now(timezone.utc).timestamp()),`
			`# title=None,`
			`# tags=None,`
			`# sources=[text_file.name],`
			`# )`
			`# except (OSError, PermissionError):`
			`# # nvm, not a valid path...`
			`# pass`
accept local paths as valid link URLs when parsing 2020-07-13 15:22:58 +00:00
			`# otherwise look for anything that looks like a URL in the line`
replace uses of URL_REGEX with find_all_urls to handle markdown better 2024-04-25 00:45:45 +00:00			`for url in find_all_urls(line):`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`yield Link(`
			`url=htmldecode(url),`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 08:19:30 +00:00			`timestamp=str(datetime.now(timezone.utc).timestamp()),`
move everything out of legacy folder 2019-04-27 21:26:24 +00:00			`title=None,`
			`tags=None,`
			`sources=[text_file.name],`
			`)`
also parse and archive sub-urls in generic_txt input 2020-07-27 22:52:02 +00:00
use KEY, NAME, and PARSER to define parsers instead of hardcoding in init 2021-03-31 05:05:49 +00:00
			`KEY = 'txt'`
			`NAME = 'Generic TXT'`
			`PARSER = parse_generic_txt_export`