2020-08-07 13:05:17 +00:00
__package__ = ' archivebox.extractors '
from pathlib import Path
2020-08-10 18:15:28 +00:00
from tempfile import NamedTemporaryFile
2020-08-07 13:05:17 +00:00
from typing import Optional
import json
2024-10-01 00:13:55 +00:00
from archivebox . misc . system import run , atomic_write
2024-10-01 00:25:15 +00:00
from archivebox . misc . util import enforce_types , is_static_file
2024-10-15 04:50:47 +00:00
from . . index . schema import Link , ArchiveResult , ArchiveError
2020-08-07 13:05:17 +00:00
from . . logging_util import TimedProgress
2022-02-08 15:17:52 +00:00
from . title import get_html
2020-08-07 13:05:17 +00:00
2024-10-15 04:50:47 +00:00
from plugins_extractor . readability . config import READABILITY_CONFIG
from plugins_extractor . readability . binaries import READABILITY_BINARY
2024-05-12 05:28:59 +00:00
def get_output_path() -> str:
    """Return the readability output directory, relative to a link's archive dir.

    The value carries a trailing slash and no surrounding whitespace so it can
    be joined onto a ``Path`` or used as a prefix for a file name.
    """
    return 'readability/'
def get_embed_path(archiveresult=None) -> str:
    """Return the relative path of the readability HTML file to embed.

    Args:
        archiveresult: Accepted but not used by this extractor; the path is
            always the ``content.html`` file inside the readability output
            directory.
    """
    return get_output_path() + 'content.html'
2020-08-07 13:05:17 +00:00
@enforce_types
def should_save_readability(link: Link, out_dir: Optional[str] = None, overwrite: Optional[bool] = False) -> bool:
    """Decide whether the readability extractor should run for ``link``.

    Args:
        link: The link being archived; its ``url`` and ``link_dir`` are read.
        out_dir: Directory to look in for existing output. Falls back to
            ``link.link_dir`` when not given.
        overwrite: When truthy, existing readability output does not prevent
            a new run.

    Returns:
        ``False`` for static-file URLs and for links whose readability output
        directory already exists (unless ``overwrite`` is set); otherwise the
        value of ``READABILITY_CONFIG.SAVE_READABILITY``.
    """
    # URLs recognised as static files are never passed to readability.
    if is_static_file(link.url):
        return False

    link_root = Path(out_dir or link.link_dir)
    readability_dir = link_root / get_output_path()

    # Only touch the filesystem when overwrite is off (short-circuit).
    if overwrite or not readability_dir.exists():
        return READABILITY_CONFIG.SAVE_READABILITY
    return False
2020-08-07 13:05:17 +00:00
@enforce_types
def save_readability(link: Link, out_dir: Optional[str] = None, timeout: int = 0) -> ArchiveResult:
    """Download a reader-friendly version of the page using @mozilla/readability.

    The page HTML is obtained via ``get_html``, written to a temporary file,
    and handed to the readability binary together with the link URL. The
    binary's JSON output is split into three files inside the readability
    output directory: ``content.html``, ``content.txt`` and ``article.json``.

    Args:
        link: The link to extract; its ``url`` and ``link_dir`` are read.
        out_dir: Archive directory for this link. Falls back to
            ``link.link_dir`` when not given.
        timeout: Seconds to allow the binary to run. A falsy value means
            ``READABILITY_CONFIG.READABILITY_TIMEOUT`` is used.

    Returns:
        An ``ArchiveResult`` whose ``status`` is ``'succeeded'`` or
        ``'failed'``. On failure ``output`` holds the exception that was
        caught; on success it holds the relative output path and
        ``index_texts`` carries the extracted plain text.

    Raises:
        AssertionError: If the loaded readability binary has no ``abspath``
            or ``version`` (checked before any work is attempted).
    """
    READABILITY_BIN = READABILITY_BINARY.load()
    assert READABILITY_BIN.abspath and READABILITY_BIN.version

    timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT
    # Resolve the working directory once so the subprocess cwd and the
    # recorded pwd are never None / the string 'None'.
    out_dir = out_dir or link.link_dir
    output_subdir = Path(out_dir).absolute() / get_output_path()
    output = get_output_path()

    # Readability Docs: https://github.com/mozilla/readability

    status = 'succeeded'
    # fake command to show the user so they have something to try debugging if get_html fails
    cmd = [
        str(READABILITY_BIN.abspath),
        '{dom,singlefile}.html',
        link.url,
    ]
    readability_content = None
    temp_doc_path = None  # set once the temp file exists, so finally can remove it
    timer = TimedProgress(timeout, prefix=' ')
    try:
        document = get_html(link, Path(out_dir))
        # Validate before touching the document: a missing document must
        # surface as an ArchiveError, not an AttributeError from .encode().
        if not document or len(document) < 10:
            raise ArchiveError('Readability could not find HTML to parse for article text')

        # delete=False because the file must outlive this handle so the
        # readability subprocess can open it by name; removed in finally.
        with NamedTemporaryFile(delete=False) as temp_doc:
            temp_doc_path = Path(temp_doc.name)
            temp_doc.write(document.encode("utf-8"))

        cmd = [
            str(READABILITY_BIN.abspath),
            str(temp_doc_path),
            link.url,
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout, text=True)

        try:
            result_json = json.loads(result.stdout)
        except json.JSONDecodeError:
            raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not result_json or 'content' not in result_json:
            raise ArchiveError('Readability output is not valid JSON', result.stdout + result.stderr)

        output_subdir.mkdir(exist_ok=True)
        readability_content = result_json.pop("textContent")
        atomic_write(str(output_subdir / "content.html"), result_json.pop("content"))
        atomic_write(str(output_subdir / "content.txt"), readability_content)
        # Whatever remains after the two pops is stored as article metadata.
        atomic_write(str(output_subdir / "article.json"), result_json)

        # Last few non-empty lines of combined output, used as failure hints.
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
            if line.strip()
        ]
        hints = (
            'Got readability response code: {}.'.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if result.returncode > 0:
            raise ArchiveError(f'Readability was not able to archive the page (status={result.returncode})', hints)
    except Exception as err:
        status = 'failed'
        output = err

        # prefer Chrome dom output to singlefile because singlefile often contains huge url(data:image/...base64) strings that make the html too long to parse with readability
        cmd = [cmd[0], './{dom,singlefile}.html']
    finally:
        timer.end()
        # Best-effort removal of the temp HTML file so repeated runs do not
        # accumulate files in the system temp directory.
        if temp_doc_path is not None:
            try:
                temp_doc_path.unlink()
            except OSError:
                pass

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=str(READABILITY_BIN.version),
        output=output,
        status=status,
        index_texts=[readability_content] if readability_content else [],
        **timer.stats,
    )