Implement hook architecture with JSONL output support

Phase 1: Database migration for new ArchiveResult fields
- Add output_str (TextField) for human-readable summary
- Add output_json (JSONField) for structured metadata
- Add output_files (JSONField) for a dict of {relative_path: metadata} (values are empty dicts for now)
- Add output_size (BigIntegerField) for total bytes
- Add output_mimetypes (CharField) for CSV of mimetypes
- Add binary FK to InstalledBinary (optional)
- Migrate existing 'output' field to new split fields
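  For example (illustrative values): a row whose old output was 'wget/index.html' ends up with
  output_str='wget/index.html', while a row whose output was '{"status": "succeeded"}' is parsed
  into output_json (with output_str left empty), per the data migration below.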

Phase 3: Update run_hook() for JSONL parsing
- Support new JSONL format (any line with {type: 'ModelName', ...}); see example after this list
- Maintain backwards compatibility with RESULT_JSON= format
- Add plugin metadata to each parsed record
- Detect background hooks with .bg. suffix in filename
- Add find_binary_for_cmd() helper function
- Add create_model_record() for processing side-effect records
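  As a reference, a hook's stdout in the new format might contain lines like these (hypothetical
  values; field names match what run_hook() and create_model_record() consume):
    {"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded 5 files", "cmd": ["/usr/bin/wget", "-p", "https://example.com"], "cmd_version": "1.21.3"}
    {"type": "InstalledBinary", "name": "wget", "abspath": "/usr/bin/wget", "version": "1.21.3", "binprovider": "env"}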

Phase 6: Update ArchiveResult.run()
- Handle background hooks (return immediately when result is None)
- Process 'records' from HookResult for side-effect models (example after this list)
- Use new output fields (output_str, output_json, output_files, etc.)
- Call create_model_record() for InstalledBinary, Machine updates
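  For context, run_hook() returns a HookResult shaped roughly like this (hypothetical values):
    {'returncode': 0, 'stdout': '...', 'stderr': '', 'output_files': ['output.html'], 'duration_ms': 1234, 'hook': '<path to hook script>', 'records': [...]}
  and returns None while a background hook is still running, which is what triggers the early
  return in ArchiveResult.run().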

Phase 7: Add background hook support
- Add is_background_hook() method to ArchiveResult
- Add check_background_completed() to check if process exited
- Add finalize_background_hook() to collect results from completed hooks
- Update SnapshotMachine.is_finished() to check/finalize background hooks (sketch after this list)
- Update _populate_output_fields() to walk directory and populate stats
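  Condensed from the SnapshotMachine.is_finished() change below, the background-hook check is roughly:
    started = snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
    for result in started:
        if not result.check_background_completed():   # process behind hook.pid is still alive
            return False
        result.finalize_background_hook()             # parse stdout.log JSONL, fill output_* fields, clean up hook.pid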

Also update references to the old 'output' field in:
- admin_archiveresults.py
- statemachines.py
- templatetags/core_tags.py
Claude 2025-12-27 08:38:49 +00:00
parent 35dd9acafe
commit 3d985fa8c8
7 changed files with 633 additions and 55 deletions

View File

@ -47,7 +47,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
# Truncate output for display
full_output = result.output or '-'
full_output = result.output_str or '-'
output_display = full_output[:60]
if len(full_output) > 60:
output_display += '...'
@ -55,8 +55,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
# Get full command as tooltip
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
# Build output link
output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
# Get version - try cmd_version field
version = result.cmd_version if result.cmd_version else '-'
@ -336,27 +337,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
)
def output_str(self, result):
# Determine output link path - use output if file exists, otherwise link to index
output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
def output_display(self, result):
# Determine output link path - use embed_path() which checks output_files
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.timestamp,
output_path,
result.output,
result.output_str,
)
def output_summary(self, result):
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
output_str = format_html(
output_html = format_html(
'<pre style="display: inline-block">{}</pre><br/>',
result.output,
result.output_str,
)
output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
path_from_output_str = (snapshot_dir / (result.output or ''))
output_str += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(result.output))
if os.access(path_from_output_str, os.R_OK):
root_dir = str(path_from_output_str)
output_html += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
path_from_embed = (snapshot_dir / (embed_path or ''))
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
if os.access(path_from_embed, os.R_OK):
root_dir = str(path_from_embed)
else:
root_dir = str(snapshot_dir)

View File

@ -0,0 +1,80 @@
# Generated by Django for hook architecture support
# Phase 1: Add new ArchiveResult fields for hook output
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('core', '0028_snapshot_fs_version'),
('machine', '0002_rename_custom_cmds_to_overrides'),
]
operations = [
# Add new output fields (keep old 'output' temporarily for migration)
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(
blank=True,
default='',
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(
null=True,
blank=True,
default=None,
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(
default=dict,
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(
default=0,
help_text='Total recursive size in bytes of all output files'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(
max_length=512,
blank=True,
default='',
help_text='CSV of mimetypes sorted by size descending'
),
),
# Add binary FK (optional)
migrations.AddField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(
'machine.InstalledBinary',
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='archiveresults',
help_text='Primary binary used by this hook (optional)'
),
),
]

View File

@ -0,0 +1,64 @@
# Generated by Django for hook architecture support
# Phase 1: Migrate existing 'output' field to new split fields
from django.db import migrations
import json
def migrate_output_field(apps, schema_editor):
"""
Migrate existing 'output' field to new split fields.
Logic:
- If output contains JSON {...}, move to output_json
- Otherwise, move to output_str
"""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for ar in ArchiveResult.objects.all().iterator():
old_output = ar.output or ''
# Case 1: JSON output
if old_output.strip().startswith('{'):
try:
parsed = json.loads(old_output)
ar.output_json = parsed
ar.output_str = ''
except json.JSONDecodeError:
# Not valid JSON, treat as string
ar.output_str = old_output
# Case 2: File path or plain string
else:
ar.output_str = old_output
ar.save(update_fields=['output_str', 'output_json'])
def reverse_migrate(apps, schema_editor):
"""Reverse migration - copy output_str back to output."""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for ar in ArchiveResult.objects.all().iterator():
if ar.output_json:
ar.output = json.dumps(ar.output_json)
else:
ar.output = ar.output_str or ''
ar.save(update_fields=['output'])
class Migration(migrations.Migration):
dependencies = [
('core', '0029_archiveresult_hook_fields'),
]
operations = [
migrations.RunPython(migrate_output_field, reverse_migrate),
# Now safe to remove old 'output' field
migrations.RemoveField(
model_name='archiveresult',
name='output',
),
]

View File

@ -36,7 +36,7 @@ from archivebox.base_models.models import (
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl
from machine.models import NetworkInterface
from machine.models import NetworkInterface, InstalledBinary
@ -485,9 +485,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
else:
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
# Filter for results that have either output_files or output_str
from django.db.models import Q
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(
Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
)}
path = self.archive_path
canon = self.canonical_outputs()
@ -499,7 +503,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
for extractor in all_extractors:
result = archive_results.get(extractor)
existing = result and result.status == 'succeeded' and result.output
existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
icon = get_extractor_icon(extractor)
output += format_html(
output_template,
@ -825,17 +829,24 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Scan each ArchiveResult's output directory for the best file
snap_dir = Path(self.output_dir)
for result in self.archiveresult_set.filter(status='succeeded'):
if not result.output:
if not result.output_files and not result.output_str:
continue
# Try to find the best output file for this extractor
extractor_dir = snap_dir / result.extractor
best_output = None
if result.output and (snap_dir / result.output).exists():
# Use the explicit output path if it exists
best_output = result.output
elif extractor_dir.exists():
# Check output_files first (new field)
if result.output_files:
first_file = next(iter(result.output_files.keys()), None)
if first_file and (extractor_dir / first_file).exists():
best_output = f'{result.extractor}/{first_file}'
# Fallback to output_str if it looks like a path
if not best_output and result.output_str and (snap_dir / result.output_str).exists():
best_output = result.output_str
if not best_output and extractor_dir.exists():
# Intelligently find the best file in the extractor's directory
best_output = find_best_output_in_dir(extractor_dir, result.extractor)
@ -873,14 +884,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
"""Get the latest output that each archive method produced"""
from archivebox.hooks import get_extractors
from django.db.models import Q
latest: Dict[str, Any] = {}
for archive_method in get_extractors():
results = self.archiveresult_set.filter(extractor=archive_method)
if status is not None:
results = results.filter(status=status)
results = results.filter(output__isnull=False).order_by('-start_ts')
latest[archive_method] = results.first().output if results.exists() else None
# Filter for results with output_files or output_str
results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
result = results.first()
# Return embed_path() for backwards compatibility
latest[archive_method] = result.embed_path() if result else None
return latest
# =========================================================================
@ -1021,7 +1036,23 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
cmd = models.JSONField(default=None, null=True, blank=True)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
output = models.CharField(max_length=1024, default=None, null=True, blank=True)
# New output fields (replacing old 'output' field)
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
# Binary FK (optional - set when hook reports cmd)
binary = models.ForeignKey(
'machine.InstalledBinary',
on_delete=models.SET_NULL,
null=True, blank=True,
related_name='archiveresults',
help_text='Primary binary used by this hook'
)
start_ts = models.DateTimeField(default=None, null=True, blank=True)
end_ts = models.DateTimeField(default=None, null=True, blank=True)
@ -1094,11 +1125,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""
Get the relative path to the embeddable output file for this result.
Returns the output field if set and file exists, otherwise tries to
Returns the first file from output_files if set, otherwise tries to
find a reasonable default based on the extractor type.
"""
if self.output:
return self.output
# Check output_files dict for primary output
if self.output_files:
# Return first file from output_files (dict preserves insertion order)
first_file = next(iter(self.output_files.keys()), None)
if first_file:
return f'{self.extractor}/{first_file}'
# Fallback: check output_str if it looks like a file path
if self.output_str and ('/' in self.output_str or '.' in self.output_str):
return self.output_str
# Try to find output file based on extractor's canonical output path
canonical = self.snapshot.canonical_outputs()
@ -1149,7 +1188,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if not hook:
self.status = self.StatusChoices.FAILED
self.output = f'No hook found for: {self.extractor}'
self.output_str = f'No hook found for: {self.extractor}'
self.retry_at = None
self.save()
return
@ -1167,8 +1206,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
)
# BACKGROUND HOOK - still running, return immediately
if result is None:
self.status = self.StatusChoices.STARTED
self.start_ts = start_ts
self.pwd = str(extractor_dir)
self.save()
return
end_ts = timezone.now()
# Get records from hook output (new JSONL format)
records = result.get('records', [])
# Clean up empty output directory if no files were created
output_files = result.get('output_files', [])
if not output_files and extractor_dir.exists():
@ -1179,14 +1230,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
except (OSError, RuntimeError):
pass # Directory not empty or can't be removed, that's fine
# Determine status from return code and JSON output
# Find the ArchiveResult record from hook output (if any)
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
output_json = result.get('output_json') or {}
json_status = output_json.get('status')
if json_status == 'skipped':
status = 'skipped'
elif json_status == 'failed':
status = 'failed'
# Determine status from records, output_json, or return code
if ar_records:
# Use status from first ArchiveResult record
hook_data = ar_records[0]
status = hook_data.get('status', 'failed')
elif output_json.get('status'):
status = output_json['status']
elif result['returncode'] == 0:
status = 'succeeded'
else:
@ -1199,20 +1253,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
'skipped': self.StatusChoices.SKIPPED,
}
self.status = status_map.get(status, self.StatusChoices.FAILED)
self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
# Set output fields from records or output_json
if ar_records:
hook_data = ar_records[0]
self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
self.output_json = hook_data.get('output_json')
# Set cmd from JSONL record
if hook_data.get('cmd'):
self.cmd = hook_data['cmd']
self._set_binary_from_cmd(hook_data['cmd'])
if hook_data.get('cmd_version'):
self.cmd_version = hook_data['cmd_version'][:128]
else:
# Fallback to legacy output_json format
self.output_str = output_json.get('output_str') or output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or ''
self.output_json = output_json.get('output_json') if output_json.get('output_json') else None
if output_json.get('cmd_version'):
self.cmd_version = output_json['cmd_version'][:128]
if output_json.get('cmd'):
self.cmd = output_json['cmd']
self._set_binary_from_cmd(output_json['cmd'])
self.start_ts = start_ts
self.end_ts = end_ts
self.retry_at = None
self.pwd = str(extractor_dir)
# Save cmd and cmd_version from extractor output
if output_json.get('cmd_version'):
self.cmd_version = output_json['cmd_version'][:128] # Max length from model
if output_json.get('cmd'):
self.cmd = output_json['cmd']
# Populate output_files, output_size, output_mimetypes from filesystem
if extractor_dir.exists():
self._populate_output_fields(extractor_dir)
self.save()
# Process side-effect records (InstalledBinary, Machine config, etc.)
from archivebox.hooks import create_model_record
for record in records:
if record.get('type') != 'ArchiveResult':
create_model_record(record.copy()) # Copy to avoid mutating original
# Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
self._queue_urls_for_crawl(extractor_dir)
@ -1226,6 +1305,84 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if self.status == self.StatusChoices.SUCCEEDED:
self.trigger_search_indexing()
def _populate_output_fields(self, output_dir: Path) -> None:
"""
Walk output directory and populate output_files, output_size, output_mimetypes.
"""
import mimetypes
from collections import defaultdict
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
# Track mimetypes and sizes for aggregation
mime_sizes = defaultdict(int)
total_size = 0
output_files = {} # Dict keyed by relative path
for file_path in output_dir.rglob('*'):
# Skip non-files and infrastructure files
if not file_path.is_file():
continue
if file_path.name in exclude_names:
continue
# Get file stats
try:
stat = file_path.stat()
mime_type, _ = mimetypes.guess_type(str(file_path))
mime_type = mime_type or 'application/octet-stream'
# Track for ArchiveResult fields
relative_path = str(file_path.relative_to(output_dir))
output_files[relative_path] = {} # Empty dict, extensible for future metadata
mime_sizes[mime_type] += stat.st_size
total_size += stat.st_size
except (OSError, IOError):
continue
# Populate ArchiveResult fields
self.output_files = output_files
self.output_size = total_size
# Build output_mimetypes CSV (sorted by size descending)
sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
def _set_binary_from_cmd(self, cmd: list) -> None:
"""
Find InstalledBinary for command and set binary FK.
Tries matching by absolute path first, then by binary name.
Only matches binaries on the current machine.
"""
if not cmd:
return
from machine.models import Machine
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
machine = Machine.current()
# Try matching by absolute path first
binary = InstalledBinary.objects.filter(
abspath=bin_path_or_name,
machine=machine
).first()
if binary:
self.binary = binary
return
# Fallback: match by binary name
bin_name = Path(bin_path_or_name).name
binary = InstalledBinary.objects.filter(
name=bin_name,
machine=machine
).first()
if binary:
self.binary = binary
def _update_snapshot_title(self, extractor_dir: Path):
"""
Update snapshot title from title extractor output.
@ -1325,3 +1482,120 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_dir(self) -> Path:
"""Get the output directory for this extractor's results."""
return Path(self.snapshot.output_dir) / self.extractor
def is_background_hook(self) -> bool:
"""Check if this ArchiveResult is for a background hook."""
extractor_dir = Path(self.pwd) if self.pwd else None
if not extractor_dir:
return False
pid_file = extractor_dir / 'hook.pid'
return pid_file.exists()
def check_background_completed(self) -> bool:
"""
Check if background hook process has exited.
Returns:
True if completed (process exited), False if still running
"""
extractor_dir = Path(self.pwd) if self.pwd else None
if not extractor_dir:
return True # No pwd = completed or failed to start
pid_file = extractor_dir / 'hook.pid'
if not pid_file.exists():
return True # No PID file = completed or failed to start
try:
pid = int(pid_file.read_text().strip())
os.kill(pid, 0) # Signal 0 = check if process exists
return False # Still running
except (OSError, ValueError):
return True # Process exited or invalid PID
def finalize_background_hook(self) -> None:
"""
Collect final results from completed background hook.
Same logic as run() but for background hooks that already started.
"""
from archivebox.hooks import create_model_record
extractor_dir = Path(self.pwd) if self.pwd else None
if not extractor_dir or not extractor_dir.exists():
self.status = self.StatusChoices.FAILED
self.output_str = 'Background hook output directory not found'
self.end_ts = timezone.now()
self.retry_at = None
self.save()
return
stdout_file = extractor_dir / 'stdout.log'
stderr_file = extractor_dir / 'stderr.log'
# Read logs
stdout = stdout_file.read_text() if stdout_file.exists() else ''
# Parse JSONL output
records = []
for line in stdout.splitlines():
line = line.strip()
if not line or not line.startswith('{'):
continue
try:
data = json.loads(line)
if 'type' in data:
records.append(data)
except json.JSONDecodeError:
continue
# Find the ArchiveResult record
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
if ar_records:
hook_data = ar_records[0]
# Apply hook's data
status_str = hook_data.get('status', 'failed')
status_map = {
'succeeded': self.StatusChoices.SUCCEEDED,
'failed': self.StatusChoices.FAILED,
'skipped': self.StatusChoices.SKIPPED,
}
self.status = status_map.get(status_str, self.StatusChoices.FAILED)
self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
self.output_json = hook_data.get('output_json')
# Determine binary FK from cmd
if hook_data.get('cmd'):
self.cmd = hook_data['cmd']
self._set_binary_from_cmd(hook_data['cmd'])
if hook_data.get('cmd_version'):
self.cmd_version = hook_data['cmd_version'][:128]
else:
# No output = failed
self.status = self.StatusChoices.FAILED
self.output_str = 'Background hook did not output ArchiveResult'
self.end_ts = timezone.now()
self.retry_at = None
# Populate output fields from filesystem
if extractor_dir.exists():
self._populate_output_fields(extractor_dir)
self.save()
# Create any side-effect records
for record in records:
if record.get('type') != 'ArchiveResult':
create_model_record(record.copy())
# Cleanup PID files and empty logs
pid_file = extractor_dir / 'hook.pid'
pid_file.unlink(missing_ok=True)
if stdout_file.exists() and stdout_file.stat().st_size == 0:
stdout_file.unlink()
if stderr_file.exists() and stderr_file.stat().st_size == 0:
stderr_file.unlink()

View File

@ -59,11 +59,22 @@ class SnapshotMachine(StateMachine, strict_states=True):
# if no archiveresults exist yet, it's not finished
if not self.snapshot.archiveresult_set.exists():
return False
# if archiveresults exist but are still pending, it's not finished
if self.snapshot.pending_archiveresults().exists():
return False
# Check for background hooks that are still running
started_results = self.snapshot.archiveresult_set.filter(
status=ArchiveResult.StatusChoices.STARTED
)
for result in started_results:
if not result.check_background_completed():
return False # Still running
# Completed - finalize it
result.finalize_background_hook()
# otherwise archiveresults exist and are all finished, so it's finished
return True
@ -184,10 +195,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
def is_backoff(self) -> bool:
"""Check if we should backoff and retry later."""
# Backoff if status is still started (extractor didn't complete) and output is None
# Backoff if status is still started (extractor didn't complete) and output_str is empty
return (
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
self.archiveresult.output is None
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
not self.archiveresult.output_str
)
def is_finished(self) -> bool:

View File

@ -80,7 +80,7 @@ def extractor_thumbnail(context, result) -> str:
return ''
# Use embed_path() for the display path (includes canonical paths)
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
# Create a mini template and render it with context
try:
@ -109,7 +109,7 @@ def extractor_embed(context, result) -> str:
if not template_str:
return ''
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
try:
tpl = template.Template(template_str)
@ -137,7 +137,7 @@ def extractor_fullscreen(context, result) -> str:
if not template_str:
return ''
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
try:
tpl = template.Template(template_str)

View File

@ -68,6 +68,8 @@ class HookResult(TypedDict, total=False):
output_files: List[str]
duration_ms: int
hook: str
# New fields for JSONL parsing
records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field
def discover_hooks(event_name: str) -> List[Path]:
@ -268,7 +270,9 @@ def run_hook(
files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
# Detect if this is a background hook (long-running daemon)
is_background = '__background' in script.stem
# New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
# Old convention: __background in stem (for backwards compatibility)
is_background = '.bg.' in script.name or '__background' in script.stem
# Set up output files for ALL hooks (useful for debugging)
stdout_file = output_dir / 'stdout.log'
@ -322,13 +326,44 @@ def run_hook(
# Exclude the log files themselves from new_files
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
# Parse RESULT_JSON from stdout
# Parse JSONL output from stdout
# Supports both new JSONL format (any line starting with { that has 'type')
# and legacy RESULT_JSON= format for backwards compatibility
output_json = None
records = []
plugin_name = script.parent.name # Plugin directory name (e.g., 'wget')
for line in stdout.splitlines():
if line.startswith('RESULT_JSON='):
line = line.strip()
if not line:
continue
# New JSONL format: any line starting with { that has 'type' field
if line.startswith('{'):
try:
output_json = json.loads(line[len('RESULT_JSON='):])
break
data = json.loads(line)
if 'type' in data:
# Add plugin metadata to every record
data['plugin'] = plugin_name
data['plugin_hook'] = str(script)
records.append(data)
# For backwards compatibility, also set output_json for first ArchiveResult
if data.get('type') == 'ArchiveResult' and output_json is None:
output_json = data
except json.JSONDecodeError:
pass
# Legacy format: RESULT_JSON=...
elif line.startswith('RESULT_JSON='):
try:
data = json.loads(line[len('RESULT_JSON='):])
if output_json is None:
output_json = data
# Convert legacy format to new format
data['type'] = 'ArchiveResult'
data['plugin'] = plugin_name
data['plugin_hook'] = str(script)
records.append(data)
except json.JSONDecodeError:
pass
@ -348,6 +383,7 @@ def run_hook(
output_files=new_files,
duration_ms=duration_ms,
hook=str(script),
records=records,
)
except Exception as e:
@ -360,6 +396,7 @@ def run_hook(
output_files=[],
duration_ms=duration_ms,
hook=str(script),
records=[],
)
@ -1104,3 +1141,112 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
return templates
# =============================================================================
# Hook Result Processing Helpers
# =============================================================================
def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
"""
Find InstalledBinary for a command, trying abspath first then name.
Only matches binaries on the current machine.
Args:
cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
machine_id: Current machine ID
Returns:
Binary ID as string if found, None otherwise
"""
if not cmd:
return None
from machine.models import InstalledBinary
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
# Try matching by absolute path first
binary = InstalledBinary.objects.filter(
abspath=bin_path_or_name,
machine_id=machine_id
).first()
if binary:
return str(binary.id)
# Fallback: match by binary name
bin_name = Path(bin_path_or_name).name
binary = InstalledBinary.objects.filter(
name=bin_name,
machine_id=machine_id
).first()
return str(binary.id) if binary else None
def create_model_record(record: Dict[str, Any]) -> Any:
"""
Generic helper to create/update model instances from hook JSONL output.
Args:
record: Dict with 'type' field and model data
Returns:
Created/updated model instance, or None if type unknown
"""
from machine.models import InstalledBinary, Machine
record_type = record.pop('type', None)
if not record_type:
return None
# Remove plugin metadata (not model fields)
record.pop('plugin', None)
record.pop('plugin_hook', None)
if record_type == 'InstalledBinary':
# InstalledBinary requires machine FK
machine = Machine.current()
record.setdefault('machine', machine)
# Required fields check
name = record.get('name')
abspath = record.get('abspath')
if not name or not abspath:
return None
obj, created = InstalledBinary.objects.update_or_create(
machine=machine,
name=name,
defaults={
'abspath': abspath,
'version': record.get('version', ''),
'sha256': record.get('sha256', ''),
'binprovider': record.get('binprovider', 'env'),
}
)
return obj
elif record_type == 'Machine':
# Machine config update (special _method handling)
method = record.pop('_method', None)
if method == 'update':
key = record.get('key')
value = record.get('value')
if key and value:
machine = Machine.current()
if not machine.config:
machine.config = {}
machine.config[key] = value
machine.save(update_fields=['config'])
return machine
return None
# Add more types as needed (Dependency, Snapshot, etc.)
else:
# Unknown type - log warning but don't fail
import sys
print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
return None