Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-12-28 14:49:55 +00:00)
Implement hook architecture with JSONL output support
Phase 1: Database migration for new ArchiveResult fields
- Add output_str (TextField) for human-readable summary
- Add output_json (JSONField) for structured metadata
- Add output_files (JSONField) for dict of {relative_path: {}}
- Add output_size (BigIntegerField) for total bytes
- Add output_mimetypes (CharField) for CSV of mimetypes
- Add binary FK to InstalledBinary (optional)
- Migrate existing 'output' field to new split fields
Phase 3: Update run_hook() for JSONL parsing
- Support new JSONL format (any line with {type: 'ModelName', ...})
- Maintain backwards compatibility with RESULT_JSON= format
- Add plugin metadata to each parsed record
- Detect background hooks with .bg. suffix in filename
- Add find_binary_for_cmd() helper function
- Add create_model_record() for processing side-effect records
Phase 6: Update ArchiveResult.run()
- Handle background hooks (return immediately when result is None)
- Process 'records' from HookResult for side-effect models
- Use new output fields (output_str, output_json, output_files, etc.)
- Call create_model_record() for InstalledBinary, Machine updates
Phase 7: Add background hook support
- Add is_background_hook() method to ArchiveResult
- Add check_background_completed() to check if process exited
- Add finalize_background_hook() to collect results from completed hooks
- Update SnapshotMachine.is_finished() to check/finalize background hooks
- Update _populate_output_fields() to walk directory and populate stats
Also updated references to old 'output' field in:
- admin_archiveresults.py
- statemachines.py
- templatetags/core_tags.py
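
To make the JSONL contract concrete before the diff, here is a sketch of what a hook script's stdout might contain. The record keys mirror those consumed by the parsing code in this commit; the script itself, its path, and all values are hypothetical:

#!/usr/bin/env python3
# Hypothetical hook script (e.g., plugins/wget/on_Snapshot__50_wget.py):
# every stdout line that is a JSON object with a 'type' key becomes a record.
import json

# Primary record: consumed by ArchiveResult.run() to set status,
# output_str, output_json, cmd, and cmd_version.
print(json.dumps({
    'type': 'ArchiveResult',
    'status': 'succeeded',
    'output_str': 'Downloaded 5 files',
    'output_json': {'redirects': []},
    'cmd': ['/usr/bin/wget', '-p', 'https://example.com'],
    'cmd_version': '1.21.4',
}))

# Side-effect record: routed to create_model_record(), which upserts an
# InstalledBinary row for the current machine.
print(json.dumps({
    'type': 'InstalledBinary',
    'name': 'wget',
    'abspath': '/usr/bin/wget',
    'version': '1.21.4',
    'binprovider': 'env',
}))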
commit 3d985fa8c8 (parent 35dd9acafe)
@@ -47,7 +47,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
         end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'

         # Truncate output for display
-        full_output = result.output or '-'
+        full_output = result.output_str or '-'
         output_display = full_output[:60]
         if len(full_output) > 60:
             output_display += '...'
@@ -55,8 +55,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
         # Get full command as tooltip
         cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')

-        # Build output link
-        output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
+        # Build output link - use embed_path() which checks output_files first
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+        output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'

         # Get version - try cmd_version field
         version = result.cmd_version if result.cmd_version else '-'
@@ -336,27 +337,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
             ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
         )

-    def output_str(self, result):
-        # Determine output link path - use output if file exists, otherwise link to index
-        output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
+    def output_display(self, result):
+        # Determine output link path - use embed_path() which checks output_files
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+        output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
         return format_html(
             '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
             result.snapshot.timestamp,
             output_path,
-            result.output,
+            result.output_str,
         )

     def output_summary(self, result):
         snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
-        output_str = format_html(
+        output_html = format_html(
             '<pre style="display: inline-block">{}</pre><br/>',
-            result.output,
+            result.output_str,
         )
-        output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
-        path_from_output_str = (snapshot_dir / (result.output or ''))
-        output_str += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(result.output))
-        if os.access(path_from_output_str, os.R_OK):
-            root_dir = str(path_from_output_str)
+        output_html += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+        path_from_embed = (snapshot_dir / (embed_path or ''))
+        output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
+        if os.access(path_from_embed, os.R_OK):
+            root_dir = str(path_from_embed)
         else:
             root_dir = str(snapshot_dir)

archivebox/core/migrations/0029_archiveresult_hook_fields.py (new file, 80 lines)
@@ -0,0 +1,80 @@
+# Generated by Django for hook architecture support
+# Phase 1: Add new ArchiveResult fields for hook output
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0028_snapshot_fs_version'),
+        ('machine', '0002_rename_custom_cmds_to_overrides'),
+    ]
+
+    operations = [
+        # Add new output fields (keep old 'output' temporarily for migration)
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_str',
+            field=models.TextField(
+                blank=True,
+                default='',
+                help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
+            ),
+        ),
+
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_json',
+            field=models.JSONField(
+                null=True,
+                blank=True,
+                default=None,
+                help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
+            ),
+        ),
+
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_files',
+            field=models.JSONField(
+                default=dict,
+                help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
+            ),
+        ),
+
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_size',
+            field=models.BigIntegerField(
+                default=0,
+                help_text='Total recursive size in bytes of all output files'
+            ),
+        ),
+
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_mimetypes',
+            field=models.CharField(
+                max_length=512,
+                blank=True,
+                default='',
+                help_text='CSV of mimetypes sorted by size descending'
+            ),
+        ),
+
+        # Add binary FK (optional)
+        migrations.AddField(
+            model_name='archiveresult',
+            name='binary',
+            field=models.ForeignKey(
+                'machine.InstalledBinary',
+                on_delete=models.SET_NULL,
+                null=True,
+                blank=True,
+                related_name='archiveresults',
+                help_text='Primary binary used by this hook (optional)'
+            ),
+        ),
+    ]
archivebox/core/migrations/0030_migrate_output_field.py (new file, 64 lines)
@@ -0,0 +1,64 @@
+# Generated by Django for hook architecture support
+# Phase 1: Migrate existing 'output' field to new split fields
+
+from django.db import migrations
+import json
+
+
+def migrate_output_field(apps, schema_editor):
+    """
+    Migrate existing 'output' field to new split fields.
+
+    Logic:
+    - If output contains JSON {...}, move to output_json
+    - Otherwise, move to output_str
+    """
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    for ar in ArchiveResult.objects.all().iterator():
+        old_output = ar.output or ''
+
+        # Case 1: JSON output
+        if old_output.strip().startswith('{'):
+            try:
+                parsed = json.loads(old_output)
+                ar.output_json = parsed
+                ar.output_str = ''
+            except json.JSONDecodeError:
+                # Not valid JSON, treat as string
+                ar.output_str = old_output
+
+        # Case 2: File path or plain string
+        else:
+            ar.output_str = old_output
+
+        ar.save(update_fields=['output_str', 'output_json'])
+
+
+def reverse_migrate(apps, schema_editor):
+    """Reverse migration - copy output_str back to output."""
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    for ar in ArchiveResult.objects.all().iterator():
+        if ar.output_json:
+            ar.output = json.dumps(ar.output_json)
+        else:
+            ar.output = ar.output_str or ''
+        ar.save(update_fields=['output'])
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0029_archiveresult_hook_fields'),
+    ]
+
+    operations = [
+        migrations.RunPython(migrate_output_field, reverse_migrate),
+
+        # Now safe to remove old 'output' field
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='output',
+        ),
+    ]
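
For intuition, a sketch of what this data migration does to two representative rows (example values, per migrate_output_field() above):

#   output = '{"status": "succeeded", "redirects": []}'
#       -> output_json = {'status': 'succeeded', 'redirects': []}, output_str = ''
#   output = 'wget/example.com/index.html'
#       -> output_str = 'wget/example.com/index.html', output_json stays None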
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
 from workers.models import ModelWithStateMachine
 from workers.tasks import bg_archive_snapshot
 from crawls.models import Crawl
-from machine.models import NetworkInterface
+from machine.models import NetworkInterface, InstalledBinary


@@ -485,9 +485,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea

         def calc_icons():
             if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
+                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
             else:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
+                # Filter for results that have either output_files or output_str
+                from django.db.models import Q
+                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(
+                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
+                )}

             path = self.archive_path
             canon = self.canonical_outputs()
@@ -499,7 +503,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea

             for extractor in all_extractors:
                 result = archive_results.get(extractor)
-                existing = result and result.status == 'succeeded' and result.output
+                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
                 icon = get_extractor_icon(extractor)
                 output += format_html(
                     output_template,
@@ -825,17 +829,24 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         # Scan each ArchiveResult's output directory for the best file
         snap_dir = Path(self.output_dir)
         for result in self.archiveresult_set.filter(status='succeeded'):
-            if not result.output:
+            if not result.output_files and not result.output_str:
                 continue

             # Try to find the best output file for this extractor
             extractor_dir = snap_dir / result.extractor
             best_output = None

-            if result.output and (snap_dir / result.output).exists():
-                # Use the explicit output path if it exists
-                best_output = result.output
-            elif extractor_dir.exists():
+            # Check output_files first (new field)
+            if result.output_files:
+                first_file = next(iter(result.output_files.keys()), None)
+                if first_file and (extractor_dir / first_file).exists():
+                    best_output = f'{result.extractor}/{first_file}'
+
+            # Fallback to output_str if it looks like a path
+            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
+                best_output = result.output_str
+
+            if not best_output and extractor_dir.exists():
                 # Intelligently find the best file in the extractor's directory
                 best_output = find_best_output_in_dir(extractor_dir, result.extractor)

@@ -873,14 +884,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
         """Get the latest output that each archive method produced"""
         from archivebox.hooks import get_extractors
+        from django.db.models import Q

         latest: Dict[str, Any] = {}
         for archive_method in get_extractors():
             results = self.archiveresult_set.filter(extractor=archive_method)
             if status is not None:
                 results = results.filter(status=status)
-            results = results.filter(output__isnull=False).order_by('-start_ts')
-            latest[archive_method] = results.first().output if results.exists() else None
+            # Filter for results with output_files or output_str
+            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
+            result = results.first()
+            # Return embed_path() for backwards compatibility
+            latest[archive_method] = result.embed_path() if result else None
         return latest

     # =========================================================================
@@ -1021,7 +1036,23 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
     cmd = models.JSONField(default=None, null=True, blank=True)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
+
+    # New output fields (replacing old 'output' field)
+    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
+    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
+    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
+    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
+    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
+
+    # Binary FK (optional - set when hook reports cmd)
+    binary = models.ForeignKey(
+        'machine.InstalledBinary',
+        on_delete=models.SET_NULL,
+        null=True, blank=True,
+        related_name='archiveresults',
+        help_text='Primary binary used by this hook'
+    )

     start_ts = models.DateTimeField(default=None, null=True, blank=True)
     end_ts = models.DateTimeField(default=None, null=True, blank=True)

@@ -1094,11 +1125,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         """
        Get the relative path to the embeddable output file for this result.

-        Returns the output field if set and file exists, otherwise tries to
+        Returns the first file from output_files if set, otherwise tries to
         find a reasonable default based on the extractor type.
         """
-        if self.output:
-            return self.output
+        # Check output_files dict for primary output
+        if self.output_files:
+            # Return first file from output_files (dict preserves insertion order)
+            first_file = next(iter(self.output_files.keys()), None)
+            if first_file:
+                return f'{self.extractor}/{first_file}'
+
+        # Fallback: check output_str if it looks like a file path
+        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
+            return self.output_str

         # Try to find output file based on extractor's canonical output path
         canonical = self.snapshot.canonical_outputs()
@@ -1149,7 +1188,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi

         if not hook:
             self.status = self.StatusChoices.FAILED
-            self.output = f'No hook found for: {self.extractor}'
+            self.output_str = f'No hook found for: {self.extractor}'
             self.retry_at = None
             self.save()
             return
@@ -1167,8 +1206,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             url=self.snapshot.url,
             snapshot_id=str(self.snapshot.id),
         )

+        # BACKGROUND HOOK - still running, return immediately
+        if result is None:
+            self.status = self.StatusChoices.STARTED
+            self.start_ts = start_ts
+            self.pwd = str(extractor_dir)
+            self.save()
+            return
+
         end_ts = timezone.now()

+        # Get records from hook output (new JSONL format)
+        records = result.get('records', [])
+
         # Clean up empty output directory if no files were created
         output_files = result.get('output_files', [])
         if not output_files and extractor_dir.exists():
@@ -1179,14 +1230,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             except (OSError, RuntimeError):
                 pass  # Directory not empty or can't be removed, that's fine

-        # Determine status from return code and JSON output
+        # Find the ArchiveResult record from hook output (if any)
+        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
         output_json = result.get('output_json') or {}
-        json_status = output_json.get('status')

-        if json_status == 'skipped':
-            status = 'skipped'
-        elif json_status == 'failed':
-            status = 'failed'
+        # Determine status from records, output_json, or return code
+        if ar_records:
+            # Use status from first ArchiveResult record
+            hook_data = ar_records[0]
+            status = hook_data.get('status', 'failed')
+        elif output_json.get('status'):
+            status = output_json['status']
         elif result['returncode'] == 0:
             status = 'succeeded'
         else:
@@ -1199,20 +1253,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             'skipped': self.StatusChoices.SKIPPED,
         }
         self.status = status_map.get(status, self.StatusChoices.FAILED)
-        self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
+
+        # Set output fields from records or output_json
+        if ar_records:
+            hook_data = ar_records[0]
+            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+            self.output_json = hook_data.get('output_json')
+            # Set cmd from JSONL record
+            if hook_data.get('cmd'):
+                self.cmd = hook_data['cmd']
+                self._set_binary_from_cmd(hook_data['cmd'])
+            if hook_data.get('cmd_version'):
+                self.cmd_version = hook_data['cmd_version'][:128]
+        else:
+            # Fallback to legacy output_json format
+            self.output_str = output_json.get('output_str') or output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or ''
+            self.output_json = output_json.get('output_json') if output_json.get('output_json') else None
+            if output_json.get('cmd_version'):
+                self.cmd_version = output_json['cmd_version'][:128]
+            if output_json.get('cmd'):
+                self.cmd = output_json['cmd']
+                self._set_binary_from_cmd(output_json['cmd'])

         self.start_ts = start_ts
         self.end_ts = end_ts
         self.retry_at = None
         self.pwd = str(extractor_dir)

-        # Save cmd and cmd_version from extractor output
-        if output_json.get('cmd_version'):
-            self.cmd_version = output_json['cmd_version'][:128]  # Max length from model
-        if output_json.get('cmd'):
-            self.cmd = output_json['cmd']
+        # Populate output_files, output_size, output_mimetypes from filesystem
+        if extractor_dir.exists():
+            self._populate_output_fields(extractor_dir)

         self.save()

+        # Process side-effect records (InstalledBinary, Machine config, etc.)
+        from archivebox.hooks import create_model_record
+        for record in records:
+            if record.get('type') != 'ArchiveResult':
+                create_model_record(record.copy())  # Copy to avoid mutating original
+
         # Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
         self._queue_urls_for_crawl(extractor_dir)

@@ -1226,6 +1305,84 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if self.status == self.StatusChoices.SUCCEEDED:
             self.trigger_search_indexing()

+    def _populate_output_fields(self, output_dir: Path) -> None:
+        """
+        Walk output directory and populate output_files, output_size, output_mimetypes.
+        """
+        import mimetypes
+        from collections import defaultdict
+
+        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
+
+        # Track mimetypes and sizes for aggregation
+        mime_sizes = defaultdict(int)
+        total_size = 0
+        output_files = {}  # Dict keyed by relative path
+
+        for file_path in output_dir.rglob('*'):
+            # Skip non-files and infrastructure files
+            if not file_path.is_file():
+                continue
+            if file_path.name in exclude_names:
+                continue
+
+            # Get file stats
+            try:
+                stat = file_path.stat()
+                mime_type, _ = mimetypes.guess_type(str(file_path))
+                mime_type = mime_type or 'application/octet-stream'
+
+                # Track for ArchiveResult fields
+                relative_path = str(file_path.relative_to(output_dir))
+                output_files[relative_path] = {}  # Empty dict, extensible for future metadata
+                mime_sizes[mime_type] += stat.st_size
+                total_size += stat.st_size
+            except (OSError, IOError):
+                continue
+
+        # Populate ArchiveResult fields
+        self.output_files = output_files
+        self.output_size = total_size
+
+        # Build output_mimetypes CSV (sorted by size descending)
+        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
+        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
+
+    def _set_binary_from_cmd(self, cmd: list) -> None:
+        """
+        Find InstalledBinary for command and set binary FK.
+
+        Tries matching by absolute path first, then by binary name.
+        Only matches binaries on the current machine.
+        """
+        if not cmd:
+            return
+
+        from machine.models import Machine
+
+        bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+        machine = Machine.current()
+
+        # Try matching by absolute path first
+        binary = InstalledBinary.objects.filter(
+            abspath=bin_path_or_name,
+            machine=machine
+        ).first()
+
+        if binary:
+            self.binary = binary
+            return
+
+        # Fallback: match by binary name
+        bin_name = Path(bin_path_or_name).name
+        binary = InstalledBinary.objects.filter(
+            name=bin_name,
+            machine=machine
+        ).first()
+
+        if binary:
+            self.binary = binary
+
     def _update_snapshot_title(self, extractor_dir: Path):
         """
         Update snapshot title from title extractor output.
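
For a concrete sense of what the directory walk in _populate_output_fields() produces, suppose an extractor directory holds index.html (120 kB) and img/logo.png (30 kB); the resulting fields would be (values illustrative):

#   output_files     = {'index.html': {}, 'img/logo.png': {}}
#   output_size      = 150000
#   output_mimetypes = 'text/html,image/png'   # mimetypes sorted by total bytes, descending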
@@ -1325,3 +1482,120 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     def output_dir(self) -> Path:
         """Get the output directory for this extractor's results."""
         return Path(self.snapshot.output_dir) / self.extractor
+
+    def is_background_hook(self) -> bool:
+        """Check if this ArchiveResult is for a background hook."""
+        extractor_dir = Path(self.pwd) if self.pwd else None
+        if not extractor_dir:
+            return False
+        pid_file = extractor_dir / 'hook.pid'
+        return pid_file.exists()
+
+    def check_background_completed(self) -> bool:
+        """
+        Check if background hook process has exited.
+
+        Returns:
+            True if completed (process exited), False if still running
+        """
+        extractor_dir = Path(self.pwd) if self.pwd else None
+        if not extractor_dir:
+            return True  # No pwd = completed or failed to start
+
+        pid_file = extractor_dir / 'hook.pid'
+        if not pid_file.exists():
+            return True  # No PID file = completed or failed to start
+
+        try:
+            pid = int(pid_file.read_text().strip())
+            os.kill(pid, 0)  # Signal 0 = check if process exists
+            return False  # Still running
+        except (OSError, ValueError):
+            return True  # Process exited or invalid PID
+
+    def finalize_background_hook(self) -> None:
+        """
+        Collect final results from completed background hook.
+
+        Same logic as run() but for background hooks that already started.
+        """
+        from archivebox.hooks import create_model_record
+
+        extractor_dir = Path(self.pwd) if self.pwd else None
+        if not extractor_dir or not extractor_dir.exists():
+            self.status = self.StatusChoices.FAILED
+            self.output_str = 'Background hook output directory not found'
+            self.end_ts = timezone.now()
+            self.retry_at = None
+            self.save()
+            return
+
+        stdout_file = extractor_dir / 'stdout.log'
+        stderr_file = extractor_dir / 'stderr.log'
+
+        # Read logs
+        stdout = stdout_file.read_text() if stdout_file.exists() else ''
+
+        # Parse JSONL output
+        records = []
+        for line in stdout.splitlines():
+            line = line.strip()
+            if not line or not line.startswith('{'):
+                continue
+            try:
+                data = json.loads(line)
+                if 'type' in data:
+                    records.append(data)
+            except json.JSONDecodeError:
+                continue
+
+        # Find the ArchiveResult record
+        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
+
+        if ar_records:
+            hook_data = ar_records[0]
+
+            # Apply hook's data
+            status_str = hook_data.get('status', 'failed')
+            status_map = {
+                'succeeded': self.StatusChoices.SUCCEEDED,
+                'failed': self.StatusChoices.FAILED,
+                'skipped': self.StatusChoices.SKIPPED,
+            }
+            self.status = status_map.get(status_str, self.StatusChoices.FAILED)
+
+            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+            self.output_json = hook_data.get('output_json')
+
+            # Determine binary FK from cmd
+            if hook_data.get('cmd'):
+                self.cmd = hook_data['cmd']
+                self._set_binary_from_cmd(hook_data['cmd'])
+            if hook_data.get('cmd_version'):
+                self.cmd_version = hook_data['cmd_version'][:128]
+        else:
+            # No output = failed
+            self.status = self.StatusChoices.FAILED
+            self.output_str = 'Background hook did not output ArchiveResult'
+
+        self.end_ts = timezone.now()
+        self.retry_at = None
+
+        # Populate output fields from filesystem
+        if extractor_dir.exists():
+            self._populate_output_fields(extractor_dir)
+
+        self.save()
+
+        # Create any side-effect records
+        for record in records:
+            if record.get('type') != 'ArchiveResult':
+                create_model_record(record.copy())
+
+        # Cleanup PID files and empty logs
+        pid_file = extractor_dir / 'hook.pid'
+        pid_file.unlink(missing_ok=True)
+        if stdout_file.exists() and stdout_file.stat().st_size == 0:
+            stdout_file.unlink()
+        if stderr_file.exists() and stderr_file.stat().st_size == 0:
+            stderr_file.unlink()

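The liveness test in check_background_completed() relies on signal 0; in isolation the pattern looks like the following minimal sketch. One caveat worth noting: os.kill raises PermissionError for a live process owned by another user, which the code above (catching all OSError) would treat as exited:

import os

def pid_is_running(pid: int) -> bool:
    # Signal 0 delivers nothing; it only asks the kernel whether the PID
    # can be signalled at all.
    try:
        os.kill(pid, 0)
        return True   # process exists and we may signal it
    except ProcessLookupError:
        return False  # no such process
    except PermissionError:
        return True   # exists, but owned by another user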
@@ -59,11 +59,22 @@ class SnapshotMachine(StateMachine, strict_states=True):
         # if no archiveresults exist yet, it's not finished
         if not self.snapshot.archiveresult_set.exists():
             return False

         # if archiveresults exist but are still pending, it's not finished
         if self.snapshot.pending_archiveresults().exists():
             return False

+        # Check for background hooks that are still running
+        started_results = self.snapshot.archiveresult_set.filter(
+            status=ArchiveResult.StatusChoices.STARTED
+        )
+        for result in started_results:
+            if not result.check_background_completed():
+                return False  # Still running
+
+            # Completed - finalize it
+            result.finalize_background_hook()
+
         # otherwise archiveresults exist and are all finished, so it's finished
         return True

@@ -184,10 +195,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):

     def is_backoff(self) -> bool:
         """Check if we should backoff and retry later."""
-        # Backoff if status is still started (extractor didn't complete) and output is None
+        # Backoff if status is still started (extractor didn't complete) and output_str is empty
         return (
-            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
-            self.archiveresult.output is None
+            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
+            not self.archiveresult.output_str
         )

     def is_finished(self) -> bool:
@@ -80,7 +80,7 @@ def extractor_thumbnail(context, result) -> str:
         return ''

     # Use embed_path() for the display path (includes canonical paths)
-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')

     # Create a mini template and render it with context
     try:
@@ -109,7 +109,7 @@ def extractor_embed(context, result) -> str:
     if not template_str:
         return ''

-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')

     try:
         tpl = template.Template(template_str)
@@ -137,7 +137,7 @@ def extractor_fullscreen(context, result) -> str:
     if not template_str:
         return ''

-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')

     try:
         tpl = template.Template(template_str)

@@ -68,6 +68,8 @@ class HookResult(TypedDict, total=False):
     output_files: List[str]
     duration_ms: int
     hook: str
+    # New fields for JSONL parsing
+    records: List[Dict[str, Any]]  # Parsed JSONL records with 'type' field


 def discover_hooks(event_name: str) -> List[Path]:
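
For orientation, a successful HookResult might look like the following; only the keys visible in this diff are shown, and the hook path and all values are illustrative:

example: HookResult = {
    'returncode': 0,
    'stdout': '{"type": "ArchiveResult", "status": "succeeded", "output_str": "saved 3 files"}\n',
    'stderr': '',
    'output_json': {'type': 'ArchiveResult', 'status': 'succeeded', 'output_str': 'saved 3 files'},
    'output_files': ['example.com/index.html'],
    'duration_ms': 1520,
    'hook': 'plugins/wget/on_Snapshot__50_wget.py',
    'records': [{'type': 'ArchiveResult', 'status': 'succeeded', 'output_str': 'saved 3 files',
                 'plugin': 'wget', 'plugin_hook': 'plugins/wget/on_Snapshot__50_wget.py'}],
}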
@@ -268,7 +270,9 @@ def run_hook(
     files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()

     # Detect if this is a background hook (long-running daemon)
-    is_background = '__background' in script.stem
+    # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
+    # Old convention: __background in stem (for backwards compatibility)
+    is_background = '.bg.' in script.name or '__background' in script.stem

     # Set up output files for ALL hooks (useful for debugging)
     stdout_file = output_dir / 'stdout.log'
@@ -322,13 +326,44 @@ def run_hook(
         # Exclude the log files themselves from new_files
         new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]

-        # Parse RESULT_JSON from stdout
+        # Parse JSONL output from stdout
+        # Supports both new JSONL format (any line starting with { that has 'type')
+        # and legacy RESULT_JSON= format for backwards compatibility
         output_json = None
+        records = []
+        plugin_name = script.parent.name  # Plugin directory name (e.g., 'wget')
+
         for line in stdout.splitlines():
-            if line.startswith('RESULT_JSON='):
+            line = line.strip()
+            if not line:
+                continue
+
+            # New JSONL format: any line starting with { that has 'type' field
+            if line.startswith('{'):
                 try:
-                    output_json = json.loads(line[len('RESULT_JSON='):])
-                    break
+                    data = json.loads(line)
+                    if 'type' in data:
+                        # Add plugin metadata to every record
+                        data['plugin'] = plugin_name
+                        data['plugin_hook'] = str(script)
+                        records.append(data)
+                        # For backwards compatibility, also set output_json for first ArchiveResult
+                        if data.get('type') == 'ArchiveResult' and output_json is None:
+                            output_json = data
                 except json.JSONDecodeError:
                     pass
+
+            # Legacy format: RESULT_JSON=...
+            elif line.startswith('RESULT_JSON='):
+                try:
+                    data = json.loads(line[len('RESULT_JSON='):])
+                    if output_json is None:
+                        output_json = data
+                    # Convert legacy format to new format
+                    data['type'] = 'ArchiveResult'
+                    data['plugin'] = plugin_name
+                    data['plugin_hook'] = str(script)
+                    records.append(data)
+                except json.JSONDecodeError:
+                    pass

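To make the two accepted stdout formats concrete, here is how the parser above would treat three sample lines (values illustrative):

# {"type": "ArchiveResult", "status": "succeeded", "output_str": "saved 3 files"}
#     -> appended to records (tagged with plugin/plugin_hook) and used as output_json
# {"type": "InstalledBinary", "name": "wget", "abspath": "/usr/bin/wget"}
#     -> appended to records; later routed to create_model_record()
# RESULT_JSON={"status": "succeeded", "output": "index.html"}
#     -> legacy line, converted to an ArchiveResult record; fills output_json if still unset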
@@ -348,6 +383,7 @@ def run_hook(
             output_files=new_files,
             duration_ms=duration_ms,
             hook=str(script),
+            records=records,
         )

     except Exception as e:
@@ -360,6 +396,7 @@ def run_hook(
             output_files=[],
             duration_ms=duration_ms,
             hook=str(script),
+            records=[],
         )

@@ -1104,3 +1141,112 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
     return templates


+# =============================================================================
+# Hook Result Processing Helpers
+# =============================================================================
+
+
+def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
+    """
+    Find InstalledBinary for a command, trying abspath first then name.
+    Only matches binaries on the current machine.
+
+    Args:
+        cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
+        machine_id: Current machine ID
+
+    Returns:
+        Binary ID as string if found, None otherwise
+    """
+    if not cmd:
+        return None
+
+    from machine.models import InstalledBinary
+
+    bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+
+    # Try matching by absolute path first
+    binary = InstalledBinary.objects.filter(
+        abspath=bin_path_or_name,
+        machine_id=machine_id
+    ).first()
+
+    if binary:
+        return str(binary.id)
+
+    # Fallback: match by binary name
+    bin_name = Path(bin_path_or_name).name
+    binary = InstalledBinary.objects.filter(
+        name=bin_name,
+        machine_id=machine_id
+    ).first()
+
+    return str(binary.id) if binary else None
+
+
+def create_model_record(record: Dict[str, Any]) -> Any:
+    """
+    Generic helper to create/update model instances from hook JSONL output.
+
+    Args:
+        record: Dict with 'type' field and model data
+
+    Returns:
+        Created/updated model instance, or None if type unknown
+    """
+    from machine.models import InstalledBinary, Machine
+
+    record_type = record.pop('type', None)
+    if not record_type:
+        return None
+
+    # Remove plugin metadata (not model fields)
+    record.pop('plugin', None)
+    record.pop('plugin_hook', None)
+
+    if record_type == 'InstalledBinary':
+        # InstalledBinary requires machine FK
+        machine = Machine.current()
+        record.setdefault('machine', machine)
+
+        # Required fields check
+        name = record.get('name')
+        abspath = record.get('abspath')
+        if not name or not abspath:
+            return None
+
+        obj, created = InstalledBinary.objects.update_or_create(
+            machine=machine,
+            name=name,
+            defaults={
+                'abspath': abspath,
+                'version': record.get('version', ''),
+                'sha256': record.get('sha256', ''),
+                'binprovider': record.get('binprovider', 'env'),
+            }
+        )
+        return obj
+
+    elif record_type == 'Machine':
+        # Machine config update (special _method handling)
+        method = record.pop('_method', None)
+        if method == 'update':
+            key = record.get('key')
+            value = record.get('value')
+            if key and value:
+                machine = Machine.current()
+                if not machine.config:
+                    machine.config = {}
+                machine.config[key] = value
+                machine.save(update_fields=['config'])
+                return machine
+        return None
+
+    # Add more types as needed (Dependency, Snapshot, etc.)
+    else:
+        # Unknown type - log warning but don't fail
+        import sys
+        print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
+        return None
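
As a usage sketch (record values hypothetical), a side-effect record flows through create_model_record() like this; note that the helper pops 'type' and the plugin metadata before touching the database, which is why callers pass a copy:

record = {
    'type': 'InstalledBinary',
    'plugin': 'wget',                                       # added by run_hook()
    'plugin_hook': 'plugins/wget/on_Snapshot__50_wget.py',  # added by run_hook()
    'name': 'wget',
    'abspath': '/usr/bin/wget',
    'version': '1.21.4',
}
binary = create_model_record(record.copy())  # pass a copy: the helper mutates its argument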