Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2025-02-16 13:28:29 +00:00)
add urls log to Crawl model

commit b948e49013 (parent 28386ff172)
6 changed files with 68 additions and 28 deletions
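
In short: Crawl gains a plain newline-separated `urls` text log, and Snapshot.save() appends each snapshot's URL to it (see the hunks below). A minimal standalone sketch of how the log accumulates and how it might be read back; `FakeCrawl` and `logged_urls` are illustrative stand-ins, not part of the commit:

    from dataclasses import dataclass

    @dataclass
    class FakeCrawl:                 # stand-in for crawls.models.Crawl, illustration only
        urls: str = ''               # mirrors the new TextField added by this commit

    def log_url(crawl: FakeCrawl, url: str) -> None:
        # mirrors the append done in Snapshot.save() in the diff below
        if url not in crawl.urls:
            crawl.urls += f'\n{url}'

    def logged_urls(crawl: FakeCrawl) -> list[str]:
        # hypothetical read-back helper, not part of the commit
        return [line for line in crawl.urls.splitlines() if line.strip()]

    crawl = FakeCrawl()
    log_url(crawl, 'https://example.com')
    log_url(crawl, 'https://example.com')    # already in the log, so skipped
    print(logged_urls(crawl))                # ['https://example.com']
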
@@ -120,7 +120,8 @@ def cli(ctx, help=False):
 def main(args=None, prog_name=None):
     # show `docker run archivebox xyz` in help messages if running in docker
     IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
-    prog_name = prog_name or ('docker compose run archivebox' if IN_DOCKER else 'archivebox')
+    IS_TTY = sys.stdin.isatty()
+    prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
 
     try:
         cli(args=args, prog_name=prog_name)

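The entrypoint hunk above only changes the suggested prog_name shown in help text: when stdin is not a TTY it adds docker compose's -T (no pseudo-TTY) flag. A small sketch of the two outcomes, assuming IN_DOCKER is set:

    import sys

    IN_DOCKER = True                 # assume we are running inside the docker image
    IS_TTY = sys.stdin.isatty()

    prog_name = f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox'
    # interactive terminal   -> 'docker compose run archivebox'
    # piped/redirected stdin -> 'docker compose run -T archivebox'
    # (-T tells `docker compose run` not to allocate a pseudo-TTY, so piped input works)
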
@@ -15,9 +15,7 @@ from django.db.models import QuerySet
 from archivebox.misc.util import enforce_types, docstring
 from archivebox import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG
-from archivebox.config.django import setup_django
 from archivebox.config.permissions import USER, HOSTNAME
-from archivebox.misc.checks import check_data_folder
 from archivebox.parsers import PARSERS
 
 

@@ -46,12 +44,8 @@ def add(urls: str | list[str],
     depth = int(depth)
 
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
 
-    # 0. setup abx, django, check_data_folder
-    setup_django()
-    check_data_folder()
-
-    # then import models once django is set up
+    # import models once django is set up
     from crawls.models import Seed, Crawl
     from workers.orchestrator import Orchestrator
     from archivebox.base_models.models import get_or_create_system_user_pk

@@ -204,8 +204,13 @@ class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
 
         if not self.timestamp:
             self.timestamp = str(self.bookmarked_at.timestamp())
 
         super().save(*args, **kwargs)
 
+        # make sure the crawl has this url in its urls log
+        if self.crawl and self.url not in self.crawl.urls:
+            self.crawl.urls += f'\n{self.url}'
+            self.crawl.save()
+
     def archive(self, overwrite=False, methods=None):
         result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)

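One design note on the append in Snapshot.save() above: `self.url not in self.crawl.urls` is a plain substring test against the whole text log, so a URL that happens to be a substring of an already-logged URL is skipped. A tiny illustration with made-up values; the exact-match variant is a hypothetical alternative, not what the commit does:

    urls_log = '\nhttps://example.com/page'

    # substring test, as used in Snapshot.save() above
    print('https://example.com' in urls_log)                 # True  -> would not be appended

    # exact-match test over the split log (hypothetical alternative)
    print('https://example.com' in urls_log.splitlines())    # False -> would be appended
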
@@ -713,7 +718,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
         """Write the ArchiveResult json, html, and merkle indexes to output dir, and pass searchable text to the search backend"""
         super().write_indexes()
         self.save_search_index()
+        # self.save_outlinks_to_crawl()
+
+    # def save_outlinks_to_crawl(self):
+    #     """Save the output of this ArchiveResult to the Crawl's urls field"""
+    #     if self.output_urls:
+    #         self.snapshot.crawl.urls += f'\n{self.url}'
+    #         self.snapshot.crawl.save()
 
     # def migrate_output_dir(self):
     #     """Move the output files to the new folder structure if needed"""
     #     print(f'{self}.migrate_output_dir()')

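For reference, the commented-out helper sketched in the hunk above would look roughly like this once enabled; `output_urls` is referenced by the comments but is not defined anywhere in this commit:

    # uncommented form of the sketch left in the diff above (not active code in this commit)
    def save_outlinks_to_crawl(self):
        """Save the output of this ArchiveResult to the Crawl's urls field"""
        if self.output_urls:                      # output_urls is assumed, not defined in this diff
            self.snapshot.crawl.urls += f'\n{self.url}'
            self.snapshot.crawl.save()
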
@@ -1,6 +1,8 @@
 __package__ = 'archivebox.core'
 
 import os
+import sys
+from django.utils import timezone
 import inspect
 from typing import Callable, get_type_hints
 from pathlib import Path

@@ -21,19 +23,18 @@ from admin_data_views.typing import TableContext, ItemContext
 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
 
 import archivebox
-from core.models import Snapshot
-from core.forms import AddLinkForm
-
-from workers.tasks import bg_add
-
-from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION
+from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
 from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 from archivebox.misc.serve_static import serve_static_with_byterange_support
 from archivebox.misc.logging_util import printable_filesize
 from archivebox.search import query_search_index
 
+from core.models import Snapshot
+from core.forms import AddLinkForm
+from crawls.models import Seed, Crawl
+
 
 
 class HomepageView(View):
     def get(self, request):

@@ -450,16 +451,14 @@ class AddView(UserPassesTestMixin, FormView):
         }
 
     def form_valid(self, form):
-        from core.admin_archiveresults import result_url
-        url = form.cleaned_data["url"]
-        print(f'[+] Adding URL: {url}')
+        urls = form.cleaned_data["url"]
+        print(f'[+] Adding URL: {urls}')
         parser = form.cleaned_data["parser"]
         tag = form.cleaned_data["tag"]
         depth = 0 if form.cleaned_data["depth"] == "0" else 1
         extractors = ','.join(form.cleaned_data["archive_methods"])
         input_kwargs = {
-            "urls": url,
+            "urls": urls,
             "tag": tag,
             "depth": depth,
             "parser": parser,

@@ -470,17 +469,50 @@ class AddView(UserPassesTestMixin, FormView):
         if extractors:
             input_kwargs.update({"extractors": extractors})
 
-        result = bg_add(input_kwargs, parent_task_id=None)
-        print('Started background add job:', result)
-
-        rough_url_count = url.count('://')
+        from archivebox.config.permissions import HOSTNAME
+
+        # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
+        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
+        sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
+
+        # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
+        seed = Seed.from_file(
+            sources_file,
+            label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
+            parser=parser,
+            tag=tag,
+            created_by=self.request.user.pk,
+            config={
+                # 'ONLY_NEW': not update,
+                # 'INDEX_ONLY': index_only,
+                # 'OVERWRITE': False,
+                'DEPTH': depth,
+                'EXTRACTORS': parser,
+                # 'DEFAULT_PERSONA': persona or 'Default',
+            })
+
+        # 3. create a new Crawl pointing to the Seed
+        crawl = Crawl.from_seed(seed, max_depth=depth)
+
+        # 4. start the Orchestrator & wait until it completes
+        # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
+        # from crawls.actors import CrawlActor
+        # from core.actors import SnapshotActor, ArchiveResultActor
+
+        rough_url_count = urls.count('://')
 
         messages.success(
             self.request,
-            mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a few minutes to see results) {result_url(result)}"),
+            mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
         )
+        # if not bg:
+        #     from workers.orchestrator import Orchestrator
+        #     orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        #     orchestrator.start()
 
-        return redirect("/admin/core/snapshot/")
+        return redirect(crawl.admin_change_url)
 
 
 class HealthCheckView(View):

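After this change the Add page no longer calls bg_add directly; it writes a sources file, wraps it in a Seed, and queues a Crawl for the orchestrator to pick up. A rough sketch of checking on such a crawl afterwards from `archivebox shell`; the `snapshot_set` related-manager name is assumed (Django's default), not confirmed by this diff:

    # rough sketch, run inside `archivebox shell`
    from crawls.models import Crawl

    crawl = Crawl.objects.order_by('-created_at').first()      # e.g. the crawl queued by the Add page
    if crawl:
        print(crawl.status, crawl.max_depth, crawl.seed.uri)
        print(crawl.urls)                                       # the new newline-separated urls log
        print(crawl.snapshot_set.count(), 'snapshots so far')   # assumes the default related_name
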
@@ -70,7 +70,7 @@ class CrawlAdmin(ABIDModelAdmin):
     search_fields = ('abid', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'seed__abid', 'schedule_id', 'schedule__abid', 'status', 'seed__uri')
 
     readonly_fields = ('created_at', 'modified_at', 'abid_info', 'snapshots', 'seed_contents')
-    fields = ('label', 'notes', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
+    fields = ('label', 'notes', 'urls', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
 
     list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
     ordering = ['-created_at', '-retry_at']

@@ -225,6 +225,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
 
     seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
+    urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl')
 
     label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')

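Since this adds a new model field, it implies a schema migration for the crawls app; roughly what the auto-generated AddField operation would look like (migration name and dependency are placeholders, not from the commit):

    # sketch of the corresponding auto-generated migration; names/numbers are placeholders
    from django.db import migrations, models

    class Migration(migrations.Migration):
        dependencies = [
            ('crawls', '0001_initial'),          # placeholder dependency
        ]
        operations = [
            migrations.AddField(
                model_name='crawl',
                name='urls',
                field=models.TextField(blank=True, default='', help_text='The log of URLs discovered in this crawl'),
            ),
        ]
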
@@ -304,7 +305,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
             return Snapshot.objects.get(crawl=self, url=self.seed.uri)
         except Snapshot.DoesNotExist:
             pass
 
         root_snapshot, _ = Snapshot.objects.update_or_create(
             crawl=self,
             url=self.seed.uri,