Remove ads and cookie banners from HTML snapshots (#695)

* integrate ublock with single-file

* reuse chromium profile
This commit is contained in:
Sascha Ißbrücker 2024-04-14 13:09:46 +02:00 committed by GitHub
parent 22a1fc80ad
commit 25470edb2c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 89 additions and 10 deletions

3
.gitignore vendored
View file

@ -191,3 +191,6 @@ typings/
/tmp
# Database file
/data
# ublock + chromium
/uBlock0.chromium
/chromium-profile

View file

@ -18,11 +18,12 @@ logger = logging.getLogger(__name__)
def create_snapshot(url: str, filepath: str):
singlefile_path = settings.LD_SINGLEFILE_PATH
# parse string to list of arguments
singlefile_options = shlex.split(settings.LD_SINGLEFILE_OPTIONS)
# parse options to list of arguments
ublock_options = shlex.split(settings.LD_SINGLEFILE_UBLOCK_OPTIONS)
custom_options = shlex.split(settings.LD_SINGLEFILE_OPTIONS)
temp_filepath = filepath + ".tmp"
# concat lists
args = [singlefile_path] + singlefile_options + [url, temp_filepath]
args = [singlefile_path] + ublock_options + custom_options + [url, temp_filepath]
try:
# Use start_new_session=True to create a new process group
process = subprocess.Popen(args, start_new_session=True)

View file

@ -61,6 +61,10 @@ class SingleFileServiceTestCase(TestCase):
expected_args = [
"single-file",
'--browser-arg="--headless=new"',
'--browser-arg="--user-data-dir=./chromium-profile"',
'--browser-arg="--no-sandbox"',
'--browser-arg="--load-extension=uBlock0.chromium"',
"http://example.com",
self.html_filepath + ".tmp",
]
@ -79,6 +83,10 @@ class SingleFileServiceTestCase(TestCase):
expected_args = [
"single-file",
'--browser-arg="--headless=new"',
'--browser-arg="--user-data-dir=./chromium-profile"',
'--browser-arg="--no-sandbox"',
'--browser-arg="--load-extension=uBlock0.chromium"',
"--some-option",
"some value",
"--another-option",
@ -97,9 +105,9 @@ class SingleFileServiceTestCase(TestCase):
with mock.patch("subprocess.Popen", return_value=mock_process):
singlefile.create_snapshot("http://example.com", self.html_filepath)
mock_process.wait.assert_called_with(timeout=60)
mock_process.wait.assert_called_with(timeout=120)
@override_settings(LD_SINGLEFILE_TIMEOUT_SEC=120)
@override_settings(LD_SINGLEFILE_TIMEOUT_SEC=180)
def test_create_snapshot_custom_timeout_setting(self):
mock_process = mock.Mock()
mock_process.wait.return_value = 0
@ -108,4 +116,4 @@ class SingleFileServiceTestCase(TestCase):
with mock.patch("subprocess.Popen", return_value=mock_process):
singlefile.create_snapshot("http://example.com", self.html_filepath)
mock_process.wait.assert_called_with(timeout=120)
mock_process.wait.assert_called_with(timeout=180)

View file

@ -9,6 +9,8 @@ mkdir -p data
mkdir -p data/favicons
# Create assets folder if it does not exist
mkdir -p data/assets
# Create chromium profile folder if it does not exist
mkdir -p chromium-profile
# Generate secret key file if it does not exist
python manage.py generate_secret_key
@ -21,8 +23,9 @@ python manage.py create_initial_superuser
# Migrate legacy background tasks to Huey
python manage.py migrate_tasks
# Ensure the DB folder is owned by the right user
# Ensure folders are owned by the right user
chown -R www-data: /etc/linkding/data
chown -R www-data: /etc/linkding/chromium-profile
# Start background task processor using supervisord, unless explicitly disabled
if [ "$LD_DISABLE_BACKGROUND_TASKS" != "True" ]; then

View file

@ -99,10 +99,29 @@ CMD curl -f http://localhost:${LD_SERVER_PORT:-9090}/${LD_CONTEXT_PATH}health ||
CMD ["./bootstrap.sh"]
FROM node:18-alpine AS ublock-build
# Build stage that downloads uBlock Origin for Chromium and patches its
# bundled filter list configuration. The extension is unpacked to
# /etc/linkding/uBlock0.chromium.
WORKDIR /etc/linkding
# Install the tools needed to download, unpack and patch the extension
RUN apk add --no-cache curl jq unzip
# Resolve the latest release tag through the GitHub API, download the matching
# Chromium build and unzip it. The download uses --fail (-f) so that an HTTP
# error (including one caused by an empty or "null" tag) aborts the build
# instead of saving the error response as uBlock0.zip.
RUN TAG=$(curl -fsSL https://api.github.com/repos/gorhill/uBlock/releases/latest | jq -r '.tag_name') && \
    DOWNLOAD_URL=https://github.com/gorhill/uBlock/releases/download/$TAG/uBlock0_$TAG.chromium.zip && \
    curl -fL -o uBlock0.zip $DOWNLOAD_URL && \
    unzip uBlock0.zip
# Patch assets.json to enable easylist-cookies by default:
# 1. Store a copy of the filter list inside the extension. --fail prevents an
#    HTTP error page from being saved as the filter list.
RUN curl -fL -o ./uBlock0.chromium/assets/thirdparties/easylist/easylist-cookies.txt https://ublockorigin.github.io/uAssets/thirdparties/easylist-cookies.txt
# 2. Remove the "off" flag from the "fanboy-cookiemonster" entry and add the
#    local copy to its content URLs.
RUN jq '."fanboy-cookiemonster" |= del(.off) | ."fanboy-cookiemonster".contentURL += ["assets/thirdparties/easylist/easylist-cookies.txt"]' ./uBlock0.chromium/assets/assets.json > temp.json && \
    mv temp.json ./uBlock0.chromium/assets/assets.json
FROM linkding AS linkding-plus
# install node, chromium
RUN apk update && apk add nodejs npm chromium
# install single-file from fork for now, which contains several hotfixes
RUN npm install -g https://github.com/sissbruecker/single-file-cli/tarball/f3730995a52f27d5041a1ad9e7528af4b6b4cf4b
RUN npm install -g https://github.com/sissbruecker/single-file-cli/tarball/4c54b3bc704cfb3e96cec2d24854caca3df0b3b6
# copy uBlock0
COPY --from=ublock-build /etc/linkding/uBlock0.chromium uBlock0.chromium/
# enable snapshot support
ENV LD_ENABLE_SNAPSHOTS=True

View file

@ -96,6 +96,24 @@ CMD curl -f http://localhost:${LD_SERVER_PORT:-9090}/${LD_CONTEXT_PATH}health ||
CMD ["./bootstrap.sh"]
FROM node:18-alpine AS ublock-build
# Build stage that downloads uBlock Origin for Chromium and patches its
# bundled filter list configuration. The extension is unpacked to
# /etc/linkding/uBlock0.chromium.
WORKDIR /etc/linkding
# Install the tools needed to download, unpack and patch the extension
RUN apk add --no-cache curl jq unzip
# Resolve the latest release tag through the GitHub API, download the matching
# Chromium build and unzip it. The download uses --fail (-f) so that an HTTP
# error (including one caused by an empty or "null" tag) aborts the build
# instead of saving the error response as uBlock0.zip.
RUN TAG=$(curl -fsSL https://api.github.com/repos/gorhill/uBlock/releases/latest | jq -r '.tag_name') && \
    DOWNLOAD_URL=https://github.com/gorhill/uBlock/releases/download/$TAG/uBlock0_$TAG.chromium.zip && \
    curl -fL -o uBlock0.zip $DOWNLOAD_URL && \
    unzip uBlock0.zip
# Patch assets.json to enable easylist-cookies by default:
# 1. Store a copy of the filter list inside the extension. --fail prevents an
#    HTTP error page from being saved as the filter list.
RUN curl -fL -o ./uBlock0.chromium/assets/thirdparties/easylist/easylist-cookies.txt https://ublockorigin.github.io/uAssets/thirdparties/easylist-cookies.txt
# 2. Remove the "off" flag from the "fanboy-cookiemonster" entry and add the
#    local copy to its content URLs.
RUN jq '."fanboy-cookiemonster" |= del(.off) | ."fanboy-cookiemonster".contentURL += ["assets/thirdparties/easylist/easylist-cookies.txt"]' ./uBlock0.chromium/assets/assets.json > temp.json && \
    mv temp.json ./uBlock0.chromium/assets/assets.json
FROM linkding AS linkding-plus
# install chromium
RUN apt-get update && apt-get -y install chromium
@ -106,6 +124,8 @@ RUN apt-get install -y gnupg2 apt-transport-https ca-certificates && \
echo "deb [signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \
apt-get update && apt-get install -y nodejs
# install single-file from fork for now, which contains several hotfixes
RUN npm install -g https://github.com/sissbruecker/single-file-cli/tarball/f3730995a52f27d5041a1ad9e7528af4b6b4cf4b
RUN npm install -g https://github.com/sissbruecker/single-file-cli/tarball/4c54b3bc704cfb3e96cec2d24854caca3df0b3b6
# copy uBlock0
COPY --from=ublock-build /etc/linkding/uBlock0.chromium uBlock0.chromium/
# enable snapshot support
ENV LD_ENABLE_SNAPSHOTS=True

13
scripts/setup-ublock.sh Executable file
View file

@ -0,0 +1,13 @@
#!/bin/sh
# Download the latest uBlock Origin release for Chromium into ./uBlock0.chromium,
# patch its assets.json so that the easylist-cookies filter list is enabled by
# default, and create the ./chromium-profile folder.
# Requires curl, jq and unzip. Run from the directory that should contain
# the extension.

# Stop at the first failing command or unset variable instead of continuing
# with a partial or broken download.
set -eu

# Remove any previous download. The folder name is case-sensitive and must
# match the folder that is patched below.
rm -rf uBlock0.chromium

# Resolve the tag of the latest release through the GitHub API. jq prints
# "null" when the response has no tag_name (for example an API error payload),
# and nothing at all when curl fails, so both cases are rejected explicitly.
TAG=$(curl -fsSL https://api.github.com/repos/gorhill/uBlock/releases/latest | jq -r '.tag_name')
if [ -z "$TAG" ] || [ "$TAG" = "null" ]; then
  echo "Could not determine the latest uBlock release tag" >&2
  exit 1
fi

# Download and unpack the Chromium build of that release. --fail (-f) makes
# curl exit with an error on HTTP failures instead of saving the error page.
DOWNLOAD_URL="https://github.com/gorhill/uBlock/releases/download/$TAG/uBlock0_$TAG.chromium.zip"
curl -fL -o uBlock0.zip "$DOWNLOAD_URL"
unzip uBlock0.zip
rm uBlock0.zip

# Store a copy of the easylist-cookies filter list inside the extension, then
# remove the "off" flag from the "fanboy-cookiemonster" entry and add the local
# copy to its content URLs.
curl -fL -o ./uBlock0.chromium/assets/thirdparties/easylist/easylist-cookies.txt https://ublockorigin.github.io/uAssets/thirdparties/easylist-cookies.txt
jq '."fanboy-cookiemonster" |= del(.off) | ."fanboy-cookiemonster".contentURL += ["assets/thirdparties/easylist/easylist-cookies.txt"]' ./uBlock0.chromium/assets/assets.json > temp.json
mv temp.json ./uBlock0.chromium/assets/assets.json

# Create the Chromium profile folder if it does not exist
mkdir -p chromium-profile

View file

@ -12,6 +12,7 @@ https://docs.djangoproject.com/en/2.2/ref/settings/
import json
import os
import shlex
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@ -294,8 +295,19 @@ LD_ENABLE_SNAPSHOTS = os.getenv("LD_ENABLE_SNAPSHOTS", False) in (
"1",
)
LD_SINGLEFILE_PATH = os.getenv("LD_SINGLEFILE_PATH", "single-file")
LD_SINGLEFILE_UBLOCK_OPTIONS = os.getenv(
"LD_SINGLEFILE_UBLOCK_OPTIONS",
shlex.join(
[
'--browser-arg="--headless=new"',
'--browser-arg="--user-data-dir=./chromium-profile"',
'--browser-arg="--no-sandbox"',
'--browser-arg="--load-extension=uBlock0.chromium"',
]
),
)
LD_SINGLEFILE_OPTIONS = os.getenv("LD_SINGLEFILE_OPTIONS", "")
LD_SINGLEFILE_TIMEOUT_SEC = float(os.getenv("LD_SINGLEFILE_TIMEOUT_SEC", 60))
LD_SINGLEFILE_TIMEOUT_SEC = float(os.getenv("LD_SINGLEFILE_TIMEOUT_SEC", 120))
# Monolith isn't used at the moment, as the local snapshot implementation
# switched to single-file after the prototype. Keeping this around in case