Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-12-28 14:49:55 +00:00)
Implement hook architecture with JSONL output support
Phase 1: Database migration for new ArchiveResult fields
- Add output_str (TextField) for human-readable summary
- Add output_json (JSONField) for structured metadata
- Add output_files (JSONField) for dict of {relative_path: {}}
- Add output_size (BigIntegerField) for total bytes
- Add output_mimetypes (CharField) for CSV of mimetypes
- Add binary FK to InstalledBinary (optional)
- Migrate existing 'output' field to new split fields
Phase 3: Update run_hook() for JSONL parsing
- Support new JSONL format (any line with {type: 'ModelName', ...})
- Maintain backwards compatibility with RESULT_JSON= format
- Add plugin metadata to each parsed record
- Detect background hooks with .bg. suffix in filename
- Add find_binary_for_cmd() helper function
- Add create_model_record() for processing side-effect records
Phase 6: Update ArchiveResult.run()
- Handle background hooks (return immediately when result is None)
- Process 'records' from HookResult for side-effect models
- Use new output fields (output_str, output_json, output_files, etc.)
- Call create_model_record() for InstalledBinary, Machine updates
Phase 7: Add background hook support
- Add is_background_hook() method to ArchiveResult
- Add check_background_completed() to check if process exited
- Add finalize_background_hook() to collect results from completed hooks
- Update SnapshotMachine.is_finished() to check/finalize background hooks
- Update _populate_output_fields() to walk directory and populate stats
Also updated references to old 'output' field in:
- admin_archiveresults.py
- statemachines.py
- templatetags/core_tags.py
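
To make the JSONL contract concrete before the diff, here is a sketch of what a hook script's stdout might contain. The record keys mirror those consumed by the parsing code in this commit; the script itself, its path, and all values are hypothetical:

#!/usr/bin/env python3
# Hypothetical hook script (e.g., plugins/wget/on_Snapshot__50_wget.py):
# every stdout line that is a JSON object with a 'type' key becomes a record.
import json

# Primary record: consumed by ArchiveResult.run() to set status,
# output_str, output_json, cmd, and cmd_version.
print(json.dumps({
    'type': 'ArchiveResult',
    'status': 'succeeded',
    'output_str': 'Downloaded 5 files',
    'output_json': {'redirects': []},
    'cmd': ['/usr/bin/wget', '-p', 'https://example.com'],
    'cmd_version': '1.21.4',
}))

# Side-effect record: routed to create_model_record(), which upserts an
# InstalledBinary row for the current machine.
print(json.dumps({
    'type': 'InstalledBinary',
    'name': 'wget',
    'abspath': '/usr/bin/wget',
    'version': '1.21.4',
    'binprovider': 'env',
}))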
commit 3d985fa8c8 (parent 35dd9acafe)
@@ -47,7 +47,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
         end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'

         # Truncate output for display
-        full_output = result.output or '-'
+        full_output = result.output_str or '-'
         output_display = full_output[:60]
         if len(full_output) > 60:
             output_display += '...'
@@ -55,8 +55,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
         # Get full command as tooltip
         cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')

-        # Build output link
-        output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
+        # Build output link - use embed_path() which checks output_files first
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+        output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'

         # Get version - try cmd_version field
         version = result.cmd_version if result.cmd_version else '-'
@@ -336,27 +337,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
             ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
         )

-    def output_str(self, result):
-        # Determine output link path - use output if file exists, otherwise link to index
-        output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
+    def output_display(self, result):
+        # Determine output link path - use embed_path() which checks output_files
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+        output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
         return format_html(
             '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
             result.snapshot.timestamp,
             output_path,
-            result.output,
+            result.output_str,
         )

     def output_summary(self, result):
         snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
-        output_str = format_html(
+        output_html = format_html(
             '<pre style="display: inline-block">{}</pre><br/>',
-            result.output,
+            result.output_str,
         )
-        output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
-        path_from_output_str = (snapshot_dir / (result.output or ''))
-        output_str += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(result.output))
-        if os.access(path_from_output_str, os.R_OK):
-            root_dir = str(path_from_output_str)
+        output_html += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+        path_from_embed = (snapshot_dir / (embed_path or ''))
+        output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
+        if os.access(path_from_embed, os.R_OK):
+            root_dir = str(path_from_embed)
         else:
             root_dir = str(snapshot_dir)

archivebox/core/migrations/0029_archiveresult_hook_fields.py (new file, 80 lines)
@@ -0,0 +1,80 @@
+# Generated by Django for hook architecture support
+# Phase 1: Add new ArchiveResult fields for hook output
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0028_snapshot_fs_version'),
+        ('machine', '0002_rename_custom_cmds_to_overrides'),
+    ]
+
+    operations = [
+        # Add new output fields (keep old 'output' temporarily for migration)
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_str',
+            field=models.TextField(
+                blank=True,
+                default='',
+                help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
+            ),
+        ),
+
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_json',
+            field=models.JSONField(
+                null=True,
+                blank=True,
+                default=None,
+                help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
+            ),
+        ),
+
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_files',
+            field=models.JSONField(
+                default=dict,
+                help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
+            ),
+        ),
+
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_size',
+            field=models.BigIntegerField(
+                default=0,
+                help_text='Total recursive size in bytes of all output files'
+            ),
+        ),
+
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_mimetypes',
+            field=models.CharField(
+                max_length=512,
+                blank=True,
+                default='',
+                help_text='CSV of mimetypes sorted by size descending'
+            ),
+        ),
+
+        # Add binary FK (optional)
+        migrations.AddField(
+            model_name='archiveresult',
+            name='binary',
+            field=models.ForeignKey(
+                'machine.InstalledBinary',
+                on_delete=models.SET_NULL,
+                null=True,
+                blank=True,
+                related_name='archiveresults',
+                help_text='Primary binary used by this hook (optional)'
+            ),
+        ),
+    ]
archivebox/core/migrations/0030_migrate_output_field.py (new file, 64 lines)
@@ -0,0 +1,64 @@
+# Generated by Django for hook architecture support
+# Phase 1: Migrate existing 'output' field to new split fields
+
+from django.db import migrations
+import json
+
+
+def migrate_output_field(apps, schema_editor):
+    """
+    Migrate existing 'output' field to new split fields.
+
+    Logic:
+    - If output contains JSON {...}, move to output_json
+    - Otherwise, move to output_str
+    """
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    for ar in ArchiveResult.objects.all().iterator():
+        old_output = ar.output or ''
+
+        # Case 1: JSON output
+        if old_output.strip().startswith('{'):
+            try:
+                parsed = json.loads(old_output)
+                ar.output_json = parsed
+                ar.output_str = ''
+            except json.JSONDecodeError:
+                # Not valid JSON, treat as string
+                ar.output_str = old_output
+
+        # Case 2: File path or plain string
+        else:
+            ar.output_str = old_output
+
+        ar.save(update_fields=['output_str', 'output_json'])
+
+
+def reverse_migrate(apps, schema_editor):
+    """Reverse migration - copy output_str back to output."""
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    for ar in ArchiveResult.objects.all().iterator():
+        if ar.output_json:
+            ar.output = json.dumps(ar.output_json)
+        else:
+            ar.output = ar.output_str or ''
+        ar.save(update_fields=['output'])
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0029_archiveresult_hook_fields'),
+    ]
+
+    operations = [
+        migrations.RunPython(migrate_output_field, reverse_migrate),
+
+        # Now safe to remove old 'output' field
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='output',
+        ),
+    ]
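
For intuition, a sketch of what this data migration does to two representative rows (example values, per migrate_output_field() above):

#   output = '{"status": "succeeded", "redirects": []}'
#       -> output_json = {'status': 'succeeded', 'redirects': []}, output_str = ''
#   output = 'wget/example.com/index.html'
#       -> output_str = 'wget/example.com/index.html', output_json stays None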
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
 from workers.models import ModelWithStateMachine
 from workers.tasks import bg_archive_snapshot
 from crawls.models import Crawl
-from machine.models import NetworkInterface
+from machine.models import NetworkInterface, InstalledBinary


@@ -485,9 +485,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea

         def calc_icons():
             if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
+                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
             else:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
+                # Filter for results that have either output_files or output_str
+                from django.db.models import Q
+                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(
+                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
+                )}

             path = self.archive_path
             canon = self.canonical_outputs()
@@ -499,7 +503,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea

             for extractor in all_extractors:
                 result = archive_results.get(extractor)
-                existing = result and result.status == 'succeeded' and result.output
+                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
                 icon = get_extractor_icon(extractor)
                 output += format_html(
                     output_template,
@@ -825,17 +829,24 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         # Scan each ArchiveResult's output directory for the best file
         snap_dir = Path(self.output_dir)
         for result in self.archiveresult_set.filter(status='succeeded'):
-            if not result.output:
+            if not result.output_files and not result.output_str:
                 continue

             # Try to find the best output file for this extractor
             extractor_dir = snap_dir / result.extractor
             best_output = None

-            if result.output and (snap_dir / result.output).exists():
-                # Use the explicit output path if it exists
-                best_output = result.output
-            elif extractor_dir.exists():
+            # Check output_files first (new field)
+            if result.output_files:
+                first_file = next(iter(result.output_files.keys()), None)
+                if first_file and (extractor_dir / first_file).exists():
+                    best_output = f'{result.extractor}/{first_file}'
+
+            # Fallback to output_str if it looks like a path
+            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
+                best_output = result.output_str
+
+            if not best_output and extractor_dir.exists():
                 # Intelligently find the best file in the extractor's directory
                 best_output = find_best_output_in_dir(extractor_dir, result.extractor)

@@ -873,14 +884,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
         """Get the latest output that each archive method produced"""
         from archivebox.hooks import get_extractors
+        from django.db.models import Q

         latest: Dict[str, Any] = {}
         for archive_method in get_extractors():
             results = self.archiveresult_set.filter(extractor=archive_method)
             if status is not None:
                 results = results.filter(status=status)
-            results = results.filter(output__isnull=False).order_by('-start_ts')
-            latest[archive_method] = results.first().output if results.exists() else None
+            # Filter for results with output_files or output_str
+            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
+            result = results.first()
+            # Return embed_path() for backwards compatibility
+            latest[archive_method] = result.embed_path() if result else None
         return latest

     # =========================================================================
@@ -1021,7 +1036,23 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
     cmd = models.JSONField(default=None, null=True, blank=True)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
+
+    # New output fields (replacing old 'output' field)
+    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
+    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
+    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
+    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
+    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
+
+    # Binary FK (optional - set when hook reports cmd)
+    binary = models.ForeignKey(
+        'machine.InstalledBinary',
+        on_delete=models.SET_NULL,
+        null=True, blank=True,
+        related_name='archiveresults',
+        help_text='Primary binary used by this hook'
+    )

     start_ts = models.DateTimeField(default=None, null=True, blank=True)
     end_ts = models.DateTimeField(default=None, null=True, blank=True)

@@ -1094,11 +1125,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         """
        Get the relative path to the embeddable output file for this result.

-        Returns the output field if set and file exists, otherwise tries to
+        Returns the first file from output_files if set, otherwise tries to
         find a reasonable default based on the extractor type.
         """
-        if self.output:
-            return self.output
+        # Check output_files dict for primary output
+        if self.output_files:
+            # Return first file from output_files (dict preserves insertion order)
+            first_file = next(iter(self.output_files.keys()), None)
+            if first_file:
+                return f'{self.extractor}/{first_file}'
+
+        # Fallback: check output_str if it looks like a file path
+        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
+            return self.output_str

         # Try to find output file based on extractor's canonical output path
         canonical = self.snapshot.canonical_outputs()
@@ -1149,7 +1188,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi

         if not hook:
             self.status = self.StatusChoices.FAILED
-            self.output = f'No hook found for: {self.extractor}'
+            self.output_str = f'No hook found for: {self.extractor}'
             self.retry_at = None
             self.save()
             return
@@ -1167,8 +1206,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             url=self.snapshot.url,
             snapshot_id=str(self.snapshot.id),
         )

+        # BACKGROUND HOOK - still running, return immediately
+        if result is None:
+            self.status = self.StatusChoices.STARTED
+            self.start_ts = start_ts
+            self.pwd = str(extractor_dir)
+            self.save()
+            return
+
         end_ts = timezone.now()

+        # Get records from hook output (new JSONL format)
+        records = result.get('records', [])
+
         # Clean up empty output directory if no files were created
         output_files = result.get('output_files', [])
         if not output_files and extractor_dir.exists():
@@ -1179,14 +1230,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             except (OSError, RuntimeError):
                 pass  # Directory not empty or can't be removed, that's fine

-        # Determine status from return code and JSON output
+        # Find the ArchiveResult record from hook output (if any)
+        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
         output_json = result.get('output_json') or {}
-        json_status = output_json.get('status')

-        if json_status == 'skipped':
-            status = 'skipped'
-        elif json_status == 'failed':
-            status = 'failed'
+        # Determine status from records, output_json, or return code
+        if ar_records:
+            # Use status from first ArchiveResult record
+            hook_data = ar_records[0]
+            status = hook_data.get('status', 'failed')
+        elif output_json.get('status'):
+            status = output_json['status']
         elif result['returncode'] == 0:
             status = 'succeeded'
         else:
@@ -1199,20 +1253,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             'skipped': self.StatusChoices.SKIPPED,
         }
         self.status = status_map.get(status, self.StatusChoices.FAILED)
-        self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
+
+        # Set output fields from records or output_json
+        if ar_records:
+            hook_data = ar_records[0]
+            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+            self.output_json = hook_data.get('output_json')
+            # Set cmd from JSONL record
+            if hook_data.get('cmd'):
+                self.cmd = hook_data['cmd']
+                self._set_binary_from_cmd(hook_data['cmd'])
+            if hook_data.get('cmd_version'):
+                self.cmd_version = hook_data['cmd_version'][:128]
+        else:
+            # Fallback to legacy output_json format
+            self.output_str = output_json.get('output_str') or output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or ''
+            self.output_json = output_json.get('output_json') if output_json.get('output_json') else None
+            if output_json.get('cmd_version'):
+                self.cmd_version = output_json['cmd_version'][:128]
+            if output_json.get('cmd'):
+                self.cmd = output_json['cmd']
+                self._set_binary_from_cmd(output_json['cmd'])

         self.start_ts = start_ts
         self.end_ts = end_ts
         self.retry_at = None
         self.pwd = str(extractor_dir)

-        # Save cmd and cmd_version from extractor output
-        if output_json.get('cmd_version'):
-            self.cmd_version = output_json['cmd_version'][:128]  # Max length from model
-        if output_json.get('cmd'):
-            self.cmd = output_json['cmd']
+        # Populate output_files, output_size, output_mimetypes from filesystem
+        if extractor_dir.exists():
+            self._populate_output_fields(extractor_dir)

         self.save()

+        # Process side-effect records (InstalledBinary, Machine config, etc.)
+        from archivebox.hooks import create_model_record
+        for record in records:
+            if record.get('type') != 'ArchiveResult':
+                create_model_record(record.copy())  # Copy to avoid mutating original
+
         # Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
         self._queue_urls_for_crawl(extractor_dir)

@@ -1226,6 +1305,84 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if self.status == self.StatusChoices.SUCCEEDED:
             self.trigger_search_indexing()

+    def _populate_output_fields(self, output_dir: Path) -> None:
+        """
+        Walk output directory and populate output_files, output_size, output_mimetypes.
+        """
+        import mimetypes
+        from collections import defaultdict
+
+        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
+
+        # Track mimetypes and sizes for aggregation
+        mime_sizes = defaultdict(int)
+        total_size = 0
+        output_files = {}  # Dict keyed by relative path
+
+        for file_path in output_dir.rglob('*'):
+            # Skip non-files and infrastructure files
+            if not file_path.is_file():
+                continue
+            if file_path.name in exclude_names:
+                continue
+
+            # Get file stats
+            try:
+                stat = file_path.stat()
+                mime_type, _ = mimetypes.guess_type(str(file_path))
+                mime_type = mime_type or 'application/octet-stream'
+
+                # Track for ArchiveResult fields
+                relative_path = str(file_path.relative_to(output_dir))
+                output_files[relative_path] = {}  # Empty dict, extensible for future metadata
+                mime_sizes[mime_type] += stat.st_size
+                total_size += stat.st_size
+            except (OSError, IOError):
+                continue
+
+        # Populate ArchiveResult fields
+        self.output_files = output_files
+        self.output_size = total_size
+
+        # Build output_mimetypes CSV (sorted by size descending)
+        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
+        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
+
+    def _set_binary_from_cmd(self, cmd: list) -> None:
+        """
+        Find InstalledBinary for command and set binary FK.
+
+        Tries matching by absolute path first, then by binary name.
+        Only matches binaries on the current machine.
+        """
+        if not cmd:
+            return
+
+        from machine.models import Machine
+
+        bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+        machine = Machine.current()
+
+        # Try matching by absolute path first
+        binary = InstalledBinary.objects.filter(
+            abspath=bin_path_or_name,
+            machine=machine
+        ).first()
+
+        if binary:
+            self.binary = binary
+            return
+
+        # Fallback: match by binary name
+        bin_name = Path(bin_path_or_name).name
+        binary = InstalledBinary.objects.filter(
+            name=bin_name,
+            machine=machine
+        ).first()
+
+        if binary:
+            self.binary = binary
+
     def _update_snapshot_title(self, extractor_dir: Path):
         """
         Update snapshot title from title extractor output.
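
For a concrete sense of what the directory walk in _populate_output_fields() produces, suppose an extractor directory holds index.html (120 kB) and img/logo.png (30 kB); the resulting fields would be (values illustrative):

#   output_files     = {'index.html': {}, 'img/logo.png': {}}
#   output_size      = 150000
#   output_mimetypes = 'text/html,image/png'   # mimetypes sorted by total bytes, descending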
@@ -1325,3 +1482,120 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     def output_dir(self) -> Path:
         """Get the output directory for this extractor's results."""
         return Path(self.snapshot.output_dir) / self.extractor
+
+    def is_background_hook(self) -> bool:
+        """Check if this ArchiveResult is for a background hook."""
+        extractor_dir = Path(self.pwd) if self.pwd else None
+        if not extractor_dir:
+            return False
+        pid_file = extractor_dir / 'hook.pid'
+        return pid_file.exists()
+
+    def check_background_completed(self) -> bool:
+        """
+        Check if background hook process has exited.
+
+        Returns:
+            True if completed (process exited), False if still running
+        """
+        extractor_dir = Path(self.pwd) if self.pwd else None
+        if not extractor_dir:
+            return True  # No pwd = completed or failed to start
+
+        pid_file = extractor_dir / 'hook.pid'
+        if not pid_file.exists():
+            return True  # No PID file = completed or failed to start
+
+        try:
+            pid = int(pid_file.read_text().strip())
+            os.kill(pid, 0)  # Signal 0 = check if process exists
+            return False  # Still running
+        except (OSError, ValueError):
+            return True  # Process exited or invalid PID
+
+    def finalize_background_hook(self) -> None:
+        """
+        Collect final results from completed background hook.
+
+        Same logic as run() but for background hooks that already started.
+        """
+        from archivebox.hooks import create_model_record
+
+        extractor_dir = Path(self.pwd) if self.pwd else None
+        if not extractor_dir or not extractor_dir.exists():
+            self.status = self.StatusChoices.FAILED
+            self.output_str = 'Background hook output directory not found'
+            self.end_ts = timezone.now()
+            self.retry_at = None
+            self.save()
+            return
+
+        stdout_file = extractor_dir / 'stdout.log'
+        stderr_file = extractor_dir / 'stderr.log'
+
+        # Read logs
+        stdout = stdout_file.read_text() if stdout_file.exists() else ''
+
+        # Parse JSONL output
+        records = []
+        for line in stdout.splitlines():
+            line = line.strip()
+            if not line or not line.startswith('{'):
+                continue
+            try:
+                data = json.loads(line)
+                if 'type' in data:
+                    records.append(data)
+            except json.JSONDecodeError:
+                continue
+
+        # Find the ArchiveResult record
+        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
+
+        if ar_records:
+            hook_data = ar_records[0]
+
+            # Apply hook's data
+            status_str = hook_data.get('status', 'failed')
+            status_map = {
+                'succeeded': self.StatusChoices.SUCCEEDED,
+                'failed': self.StatusChoices.FAILED,
+                'skipped': self.StatusChoices.SKIPPED,
+            }
+            self.status = status_map.get(status_str, self.StatusChoices.FAILED)
+
+            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+            self.output_json = hook_data.get('output_json')
+
+            # Determine binary FK from cmd
+            if hook_data.get('cmd'):
+                self.cmd = hook_data['cmd']
+                self._set_binary_from_cmd(hook_data['cmd'])
+            if hook_data.get('cmd_version'):
+                self.cmd_version = hook_data['cmd_version'][:128]
+        else:
+            # No output = failed
+            self.status = self.StatusChoices.FAILED
+            self.output_str = 'Background hook did not output ArchiveResult'
+
+        self.end_ts = timezone.now()
+        self.retry_at = None
+
+        # Populate output fields from filesystem
+        if extractor_dir.exists():
+            self._populate_output_fields(extractor_dir)
+
+        self.save()
+
+        # Create any side-effect records
+        for record in records:
+            if record.get('type') != 'ArchiveResult':
+                create_model_record(record.copy())
+
+        # Cleanup PID files and empty logs
+        pid_file = extractor_dir / 'hook.pid'
+        pid_file.unlink(missing_ok=True)
+        if stdout_file.exists() and stdout_file.stat().st_size == 0:
+            stdout_file.unlink()
+        if stderr_file.exists() and stderr_file.stat().st_size == 0:
+            stderr_file.unlink()

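The liveness test in check_background_completed() relies on signal 0; in isolation the pattern looks like the following minimal sketch. One caveat worth noting: os.kill raises PermissionError for a live process owned by another user, which the code above (catching all OSError) would treat as exited:

import os

def pid_is_running(pid: int) -> bool:
    # Signal 0 delivers nothing; it only asks the kernel whether the PID
    # can be signalled at all.
    try:
        os.kill(pid, 0)
        return True   # process exists and we may signal it
    except ProcessLookupError:
        return False  # no such process
    except PermissionError:
        return True   # exists, but owned by another user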
@@ -59,11 +59,22 @@ class SnapshotMachine(StateMachine, strict_states=True):
         # if no archiveresults exist yet, it's not finished
         if not self.snapshot.archiveresult_set.exists():
             return False

         # if archiveresults exist but are still pending, it's not finished
         if self.snapshot.pending_archiveresults().exists():
             return False

+        # Check for background hooks that are still running
+        started_results = self.snapshot.archiveresult_set.filter(
+            status=ArchiveResult.StatusChoices.STARTED
+        )
+        for result in started_results:
+            if not result.check_background_completed():
+                return False  # Still running
+
+            # Completed - finalize it
+            result.finalize_background_hook()
+
         # otherwise archiveresults exist and are all finished, so it's finished
         return True

@@ -184,10 +195,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):

     def is_backoff(self) -> bool:
         """Check if we should backoff and retry later."""
-        # Backoff if status is still started (extractor didn't complete) and output is None
+        # Backoff if status is still started (extractor didn't complete) and output_str is empty
         return (
-            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
-            self.archiveresult.output is None
+            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
+            not self.archiveresult.output_str
         )

     def is_finished(self) -> bool:
@@ -80,7 +80,7 @@ def extractor_thumbnail(context, result) -> str:
         return ''

     # Use embed_path() for the display path (includes canonical paths)
-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')

     # Create a mini template and render it with context
     try:
@@ -109,7 +109,7 @@ def extractor_embed(context, result) -> str:
     if not template_str:
         return ''

-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')

     try:
         tpl = template.Template(template_str)
@@ -137,7 +137,7 @@ def extractor_fullscreen(context, result) -> str:
     if not template_str:
         return ''

-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')

     try:
         tpl = template.Template(template_str)

@@ -68,6 +68,8 @@ class HookResult(TypedDict, total=False):
     output_files: List[str]
     duration_ms: int
     hook: str
+    # New fields for JSONL parsing
+    records: List[Dict[str, Any]]  # Parsed JSONL records with 'type' field


 def discover_hooks(event_name: str) -> List[Path]:
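
For orientation, a successful HookResult might look like the following; only the keys visible in this diff are shown, and the hook path and all values are illustrative:

example: HookResult = {
    'returncode': 0,
    'stdout': '{"type": "ArchiveResult", "status": "succeeded", "output_str": "saved 3 files"}\n',
    'stderr': '',
    'output_json': {'type': 'ArchiveResult', 'status': 'succeeded', 'output_str': 'saved 3 files'},
    'output_files': ['example.com/index.html'],
    'duration_ms': 1520,
    'hook': 'plugins/wget/on_Snapshot__50_wget.py',
    'records': [{'type': 'ArchiveResult', 'status': 'succeeded', 'output_str': 'saved 3 files',
                 'plugin': 'wget', 'plugin_hook': 'plugins/wget/on_Snapshot__50_wget.py'}],
}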
@@ -268,7 +270,9 @@ def run_hook(
     files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()

     # Detect if this is a background hook (long-running daemon)
-    is_background = '__background' in script.stem
+    # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
+    # Old convention: __background in stem (for backwards compatibility)
+    is_background = '.bg.' in script.name or '__background' in script.stem

     # Set up output files for ALL hooks (useful for debugging)
     stdout_file = output_dir / 'stdout.log'
@@ -322,13 +326,44 @@ def run_hook(
         # Exclude the log files themselves from new_files
         new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]

-        # Parse RESULT_JSON from stdout
+        # Parse JSONL output from stdout
+        # Supports both new JSONL format (any line starting with { that has 'type')
+        # and legacy RESULT_JSON= format for backwards compatibility
         output_json = None
+        records = []
+        plugin_name = script.parent.name  # Plugin directory name (e.g., 'wget')
+
         for line in stdout.splitlines():
-            if line.startswith('RESULT_JSON='):
+            line = line.strip()
+            if not line:
+                continue
+
+            # New JSONL format: any line starting with { that has 'type' field
+            if line.startswith('{'):
                 try:
-                    output_json = json.loads(line[len('RESULT_JSON='):])
-                    break
+                    data = json.loads(line)
+                    if 'type' in data:
+                        # Add plugin metadata to every record
+                        data['plugin'] = plugin_name
+                        data['plugin_hook'] = str(script)
+                        records.append(data)
+                        # For backwards compatibility, also set output_json for first ArchiveResult
+                        if data.get('type') == 'ArchiveResult' and output_json is None:
+                            output_json = data
                 except json.JSONDecodeError:
                     pass
+
+            # Legacy format: RESULT_JSON=...
+            elif line.startswith('RESULT_JSON='):
+                try:
+                    data = json.loads(line[len('RESULT_JSON='):])
+                    if output_json is None:
+                        output_json = data
+                    # Convert legacy format to new format
+                    data['type'] = 'ArchiveResult'
+                    data['plugin'] = plugin_name
+                    data['plugin_hook'] = str(script)
+                    records.append(data)
+                except json.JSONDecodeError:
+                    pass

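To make the two accepted stdout formats concrete, here is how the parser above would treat three sample lines (values illustrative):

# {"type": "ArchiveResult", "status": "succeeded", "output_str": "saved 3 files"}
#     -> appended to records (tagged with plugin/plugin_hook) and used as output_json
# {"type": "InstalledBinary", "name": "wget", "abspath": "/usr/bin/wget"}
#     -> appended to records; later routed to create_model_record()
# RESULT_JSON={"status": "succeeded", "output": "index.html"}
#     -> legacy line, converted to an ArchiveResult record; fills output_json if still unset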
@@ -348,6 +383,7 @@ def run_hook(
             output_files=new_files,
             duration_ms=duration_ms,
             hook=str(script),
+            records=records,
         )

     except Exception as e:
@@ -360,6 +396,7 @@ def run_hook(
             output_files=[],
             duration_ms=duration_ms,
             hook=str(script),
+            records=[],
         )

@@ -1104,3 +1141,112 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
     return templates


+# =============================================================================
+# Hook Result Processing Helpers
+# =============================================================================
+
+
+def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
+    """
+    Find InstalledBinary for a command, trying abspath first then name.
+    Only matches binaries on the current machine.
+
+    Args:
+        cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
+        machine_id: Current machine ID
+
+    Returns:
+        Binary ID as string if found, None otherwise
+    """
+    if not cmd:
+        return None
+
+    from machine.models import InstalledBinary
+
+    bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+
+    # Try matching by absolute path first
+    binary = InstalledBinary.objects.filter(
+        abspath=bin_path_or_name,
+        machine_id=machine_id
+    ).first()
+
+    if binary:
+        return str(binary.id)
+
+    # Fallback: match by binary name
+    bin_name = Path(bin_path_or_name).name
+    binary = InstalledBinary.objects.filter(
+        name=bin_name,
+        machine_id=machine_id
+    ).first()
+
+    return str(binary.id) if binary else None
+
+
+def create_model_record(record: Dict[str, Any]) -> Any:
+    """
+    Generic helper to create/update model instances from hook JSONL output.
+
+    Args:
+        record: Dict with 'type' field and model data
+
+    Returns:
+        Created/updated model instance, or None if type unknown
+    """
+    from machine.models import InstalledBinary, Machine
+
+    record_type = record.pop('type', None)
+    if not record_type:
+        return None
+
+    # Remove plugin metadata (not model fields)
+    record.pop('plugin', None)
+    record.pop('plugin_hook', None)
+
+    if record_type == 'InstalledBinary':
+        # InstalledBinary requires machine FK
+        machine = Machine.current()
+        record.setdefault('machine', machine)
+
+        # Required fields check
+        name = record.get('name')
+        abspath = record.get('abspath')
+        if not name or not abspath:
+            return None
+
+        obj, created = InstalledBinary.objects.update_or_create(
+            machine=machine,
+            name=name,
+            defaults={
+                'abspath': abspath,
+                'version': record.get('version', ''),
+                'sha256': record.get('sha256', ''),
+                'binprovider': record.get('binprovider', 'env'),
+            }
+        )
+        return obj
+
+    elif record_type == 'Machine':
+        # Machine config update (special _method handling)
+        method = record.pop('_method', None)
+        if method == 'update':
+            key = record.get('key')
+            value = record.get('value')
+            if key and value:
+                machine = Machine.current()
+                if not machine.config:
+                    machine.config = {}
+                machine.config[key] = value
+                machine.save(update_fields=['config'])
+                return machine
+        return None
+
+    # Add more types as needed (Dependency, Snapshot, etc.)
+    else:
+        # Unknown type - log warning but don't fail
+        import sys
+        print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
+        return None
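
As a usage sketch (record values hypothetical), a side-effect record flows through create_model_record() like this; note that the helper pops 'type' and the plugin metadata before touching the database, which is why callers pass a copy:

record = {
    'type': 'InstalledBinary',
    'plugin': 'wget',                                       # added by run_hook()
    'plugin_hook': 'plugins/wget/on_Snapshot__50_wget.py',  # added by run_hook()
    'name': 'wget',
    'abspath': '/usr/bin/wget',
    'version': '1.21.4',
}
binary = create_model_record(record.copy())  # pass a copy: the helper mutates its argument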