2020-08-07 13:05:17 +00:00
__package__ = ' archivebox.extractors '
from pathlib import Path
2020-08-10 18:15:28 +00:00
from tempfile import NamedTemporaryFile
2020-08-07 13:05:17 +00:00
from typing import Optional
import json
from . . index . schema import Link , ArchiveResult , ArchiveError
from . . system import run , atomic_write
from . . util import (
enforce_types ,
2020-08-11 12:40:55 +00:00
is_static_file ,
2020-08-07 13:05:17 +00:00
)
from . . config import (
TIMEOUT ,
2020-09-01 15:16:24 +00:00
CURL_BINARY ,
2020-08-07 13:05:17 +00:00
SAVE_READABILITY ,
2020-08-18 22:40:19 +00:00
DEPENDENCIES ,
2020-08-07 13:05:17 +00:00
READABILITY_VERSION ,
)
from . . logging_util import TimedProgress
2022-02-08 15:17:52 +00:00
from . title import get_html
2020-08-07 13:05:17 +00:00
2024-05-12 05:28:59 +00:00
def get_output_path():
    """Return the relative output location used by the readability extractor."""
    readability_dir = ' readability/ '
    return readability_dir
def get_embed_path(archiveresult=None):
    """Return the extractor's output path with the embeddable HTML file name appended.

    ``archiveresult`` is accepted for interface compatibility but is not read.
    """
    embed_file = ' content.html '
    return ''.join((get_output_path(), embed_file))
2020-08-07 13:05:17 +00:00
@enforce_types
def should_save_readability(link: Link, out_dir: Optional[str] = None, overwrite: Optional[bool] = False) -> bool:
    """Decide whether the readability extractor should run for ``link``.

    Args:
        link: the link being archived; its ``url`` and ``link_dir`` are read.
        out_dir: directory holding the link's archive output; falls back to
            ``link.link_dir`` when not given.
        overwrite: when truthy, existing output does not prevent a new run.

    Returns:
        ``False`` when the URL is a static file, or when the output path
        already exists and ``overwrite`` is falsy; otherwise the value of
        the ``SAVE_READABILITY`` config setting.
    """
    if is_static_file(link.url):
        return False

    # Coerce to Path so that a plain-string out_dir (the annotated type)
    # supports the `/` join below instead of raising TypeError.
    out_dir = Path(out_dir or link.link_dir)
    if not overwrite and (out_dir / get_output_path()).exists():
        return False

    return SAVE_READABILITY
2020-08-07 13:05:17 +00:00
@enforce_types
def save_readability(link: Link, out_dir: Optional[str] = None, timeout: int = TIMEOUT) -> ArchiveResult:
    """Download a reader-friendly version of the page using @mozilla/readability.

    The page HTML is obtained via ``get_html``, written to a temporary file,
    and handed to the readability binary together with the link URL. The
    binary's stdout is parsed as JSON; the article HTML, its plain text and
    the remaining metadata are then written into the output folder.

    Args:
        link: the link being archived; its ``url`` and ``link_dir`` are read.
        out_dir: directory holding the link's archive output; falls back to
            ``link.link_dir`` when not given.
        timeout: passed to the progress timer and to the subprocess call.

    Returns:
        An ``ArchiveResult`` describing the run. Failures are not raised to
        the caller: any exception is captured, the status is set to failed
        and the exception becomes the result's ``output``.
    """
    out_dir = Path(out_dir or link.link_dir)
    output_folder = out_dir.absolute() / get_output_path()
    output = get_output_path()

    # Readability Docs: https://github.com/mozilla/readability

    status = ' succeeded '
    # fake command to show the user so they have something to try debugging if get_html fails
    cmd = [
        CURL_BINARY,
        link.url,
    ]
    readability_content = None
    temp_doc_name = None  # set once the temporary input file exists, for cleanup
    timer = TimedProgress(timeout, prefix=' ')
    try:
        document = get_html(link, out_dir)
        # Validate before encoding/writing: a missing document would otherwise
        # fail on .encode() and hide this more descriptive error.
        if not document or len(document) < 10:
            raise ArchiveError(' Readability could not find HTML to parse for article text ')

        # delete=False because the file must still exist after it is closed,
        # so the readability subprocess can open it by name.
        with NamedTemporaryFile(delete=False) as temp_doc:
            temp_doc_name = temp_doc.name
            temp_doc.write(document.encode(" utf-8 "))

        cmd = [
            DEPENDENCIES[' READABILITY_BINARY '][' path '],
            temp_doc_name,
            link.url,
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout)
        try:
            result_json = json.loads(result.stdout)
        except json.JSONDecodeError as json_err:
            raise ArchiveError(' Readability was not able to archive the page (invalid JSON) ', result.stdout + result.stderr) from json_err
        # Explicit check instead of `assert`, which is stripped under `python -O`
        if not result_json or ' content ' not in result_json:
            raise ArchiveError(' Readability output is not valid JSON ')

        output_folder.mkdir(exist_ok=True)
        readability_content = result_json.pop(" textContent ")
        atomic_write(str(output_folder / " content.html "), result_json.pop(" content "))
        atomic_write(str(output_folder / " content.txt "), readability_content)
        # whatever is left after the two pops above is stored as article metadata
        atomic_write(str(output_folder / " article.json "), result_json)

        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit(' \n ', 5)[-5:]
            if line.strip()
        ]
        hints = (
            ' Got readability response code: {} . '.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if (result.returncode > 0):
            raise ArchiveError(f' Readability was not able to archive the page (status= {result.returncode} ) ', hints)
    except Exception as err:
        # Deliberately broad: every failure is reported through the returned
        # ArchiveResult rather than propagated to the caller.
        status = ' failed '
        output = err

        # prefer Chrome dom output to singlefile because singlefile often contains huge url(data:image/...base64) strings that make the html too long to parse with readability
        cmd = [cmd[0], ' ./ { dom,singlefile}.html ']
    finally:
        timer.end()
        # Remove the temporary input file so repeated runs do not leak files.
        if temp_doc_name is not None:
            try:
                Path(temp_doc_name).unlink()
            except OSError:
                pass  # best-effort cleanup; the file may already be gone

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=READABILITY_VERSION,
        output=output,
        status=status,
        index_texts=[readability_content] if readability_content else [],
        **timer.stats,
    )