ArchiveBox/archivebox/core/admin_archiveresults.py
Claude b632894bc9
Update views, API, and exports for new ArchiveResult output fields
Replace old `output` field with new fields across the codebase:
- output_str: Human-readable output summary
- output_json: Structured metadata (optional)
- output_files: Dict of output files with metadata
- output_size: Total size in bytes
- output_mimetypes: CSV of file mimetypes

Files updated:
- api/v1_core.py: Update MinimalArchiveResultSchema to expose new fields
- api/v1_core.py: Update ArchiveResultFilterSchema to search output_str
- cli/archivebox_extract.py: Use output_str in CLI output
- core/admin_archiveresults.py: Update admin fields, search, and fieldsets
- core/admin_archiveresults.py: Fix output_html variable name bug in output_summary
- misc/jsonl.py: Update archiveresult_to_jsonl() to include new fields
- plugins/extractor_utils.py: Update ExtractorResult helper class

The embed_path() method already uses output_files and output_str,
so snapshot detail page and template tags work correctly.
2025-12-27 20:28:22 +00:00

386 lines
19 KiB
Python

__package__ = 'archivebox.core'
import os
from pathlib import Path
from django.contrib import admin
from django.utils.html import format_html, mark_safe
from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
# Badge (text colour, background colour) for each known ArchiveResult status.
_ARCHIVERESULT_STATUS_COLORS = {
    'succeeded': ('#166534', '#dcfce7'),  # green
    'failed': ('#991b1b', '#fee2e2'),     # red
    'queued': ('#6b7280', '#f3f4f6'),     # gray
    'started': ('#92400e', '#fef3c7'),    # amber
}
_ARCHIVERESULT_DEFAULT_STATUS_COLORS = ('#6b7280', '#f3f4f6')

# One result = a summary <tr> plus a second <tr> holding a collapsible <details>.
# The template is filled in with format_html(), so every value is HTML-escaped
# unless it is already marked safe.  It must not contain literal braces.
_ARCHIVERESULT_ROW_TEMPLATE = '''
<tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
    <td style="padding: 10px 12px; white-space: nowrap;">
        <a href="{change_url}"
           style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
           title="View/edit archive result">
            <code>{short_id}</code>
        </a>
    </td>
    <td style="padding: 10px 12px; white-space: nowrap;">
        <span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
                     font-size: 11px; font-weight: 600; text-transform: uppercase;
                     color: {color}; background: {bg};">{status}</span>
    </td>
    <td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{extractor}">
        {icon}
    </td>
    <td style="padding: 10px 12px; font-weight: 500; color: #334155;">
        <a href="{output_link}" target="_blank"
           style="color: #334155; text-decoration: none;"
           title="View output fullscreen"
           onmouseover="this.style.color='#2563eb'; this.style.textDecoration='underline';"
           onmouseout="this.style.color='#334155'; this.style.textDecoration='none';">
            {extractor}
        </a>
    </td>
    <td style="padding: 10px 12px; max-width: 280px;">
        <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
              style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;"
              title="Click to expand full output">
            {output_display}
        </span>
    </td>
    <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
        {end_time}
    </td>
    <td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
        {version}
    </td>
    <td style="padding: 10px 8px; white-space: nowrap;">
        <div style="display: flex; gap: 4px;">
            <a href="{output_link}" target="_blank"
               style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
               title="View output">📄</a>
            <a href="{change_url}"
               style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
               title="Edit">✏️</a>
        </div>
    </td>
</tr>
<tr style="border-bottom: 1px solid #e2e8f0;">
    <td colspan="8" style="padding: 0 12px 10px 12px;">
        <details id="{row_id}" style="margin: 0;">
            <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
                Details &amp; Output
            </summary>
            <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
                <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                    <span style="margin-right: 16px;"><b>ID:</b> <code>{result_id}</code></span>
                    <span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
                    <span style="margin-right: 16px;"><b>PWD:</b> <code>{pwd}</code></span>
                </div>
                <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                    <b>Output:</b>
                </div>
                <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{full_output}</pre>
                <div style="font-size: 11px; color: #64748b; margin-top: 8px;">
                    <b>Command:</b>
                </div>
                <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
            </div>
        </details>
    </td>
</tr>
'''

# Extra row shown when the queryset holds more results than were rendered.
_ARCHIVERESULT_FOOTER_TEMPLATE = '''
<tr>
    <td colspan="8" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
        Showing {limit} of {total_count} results &nbsp;
        <a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot_id}"
           style="color: #2563eb;">View all →</a>
    </td>
</tr>
'''

# Static wrapper markup: contains no interpolated values.
_ARCHIVERESULT_TABLE_HEAD = '''
<div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
    <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
        <thead>
            <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
                <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
                <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
                <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
                <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
                <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
                <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
                <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
                <th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
            </tr>
        </thead>
        <tbody>
'''

_ARCHIVERESULT_TABLE_TAIL = '''
        </tbody>
    </table>
</div>
'''


def render_archiveresults_list(archiveresults_qs, limit=50):
    """Render an inline HTML table of archive results for the admin UI.

    Each result gets a summary row (id, status badge, extractor icon and name,
    truncated output, completion time, version, action links) followed by a
    collapsible row with the full output and command.

    All values read from the results are HTML-escaped via ``format_html()``;
    only the extractor icon is inserted verbatim.

    Args:
        archiveresults_qs: queryset of ArchiveResult rows; it is ordered by
            ``extractor`` and sliced to ``limit`` here.
        limit: maximum number of results to render.  When the queryset holds
            more, a footer row links to the full admin changelist.

    Returns:
        A safe HTML string: the table, or a placeholder message when the
        queryset is empty.
    """
    results = list(archiveresults_qs.order_by('extractor').select_related('snapshot')[:limit])

    if not results:
        return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')

    rows = []
    for idx, result in enumerate(results):
        status = result.status or 'queued'
        color, bg = _ARCHIVERESULT_STATUS_COLORS.get(status, _ARCHIVERESULT_DEFAULT_STATUS_COLORS)

        # The icon is markup produced by the project's own helper, so it is
        # inserted verbatim (as before) instead of being escaped.
        icon = mark_safe(get_extractor_icon(result.extractor))

        end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'

        # Truncate the raw text first; escaping happens later in format_html().
        full_output = result.output_str or '-'
        output_display = full_output[:60]
        if len(full_output) > 60:
            output_display += '...'

        # Full command, shown in the expandable details section.
        if isinstance(result.cmd, list):
            cmd_str = ' '.join(str(arg) for arg in result.cmd)
        else:
            cmd_str = str(result.cmd or '-')

        # Link to the embedded output only for succeeded results; otherwise
        # fall back to the snapshot's archive directory.
        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
        if embed_path and result.status == 'succeeded':
            output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}'
        else:
            output_link = f'/archive/{result.snapshot.timestamp}/'

        result_id = str(result.id)
        short_id = result_id[:8]

        rows.append(format_html(
            _ARCHIVERESULT_ROW_TEMPLATE,
            change_url=reverse('admin:core_archiveresult_change', args=[result.id]),
            short_id=short_id,
            result_id=result_id,
            # Unique DOM id tying the "expand" click target to its <details>.
            row_id=f'output_{idx}_{short_id}',
            color=color,
            bg=bg,
            status=status,
            extractor=result.extractor,
            icon=icon,
            output_link=output_link,
            output_display=output_display,
            full_output=full_output,
            end_time=end_time,
            version=result.cmd_version if result.cmd_version else '-',
            pwd=result.pwd or '-',
            cmd_str=cmd_str,
        ))

    total_count = archiveresults_qs.count()
    footer = ''
    if total_count > limit:
        footer = format_html(
            _ARCHIVERESULT_FOOTER_TEMPLATE,
            limit=limit,
            total_count=total_count,
            snapshot_id=results[0].snapshot_id,
        )

    # Every piece below is either static markup or already-escaped output.
    return mark_safe(
        _ARCHIVERESULT_TABLE_HEAD
        + ''.join(rows)
        + footer
        + _ARCHIVERESULT_TABLE_TAIL
    )
class ArchiveResultInline(admin.TabularInline):
    """Tabular inline listing the ArchiveResults of a Snapshot in the admin."""

    name = 'Archive Results Log'
    model = ArchiveResult
    parent_model = Snapshot
    # fk_name = 'snapshot'
    extra = 0
    sort_fields = ('end_ts', 'extractor', 'output_str', 'status', 'cmd_version')
    readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
    fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
    # exclude = ('id',)
    ordering = ('end_ts',)
    show_change_link = True
    # # classes = ['collapse']

    def get_parent_object_from_request(self, request):
        """Return the parent object addressed by the request URL, or None.

        None is returned when the resolved URL carries no ``object_id``, when
        no such object exists, or when the id fails validation.
        """
        resolved = resolve(request.path_info)
        object_id = resolved.kwargs.get('object_id')
        if object_id is None:
            # URLs without an object id cannot name a parent object.
            return None
        try:
            return self.parent_model.objects.get(pk=object_id)
        except (self.parent_model.DoesNotExist, ValidationError):
            return None

    @admin.display(
        description='Completed',
        ordering='end_ts',
    )
    def completed(self, obj):
        """Render the completion timestamp, or '-' while the result has none."""
        if not obj.end_ts:
            return '-'
        return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))

    def result_id(self, obj):
        """Render the first 8 characters of the id as a link to the change page."""
        return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), str(obj.id)[:8])

    def command(self, obj):
        """Render the command as a single space-joined line."""
        cmd = obj.cmd or []
        # Joining a plain string would put a space between every character,
        # so only join real argument lists.
        if isinstance(cmd, (list, tuple)):
            cmd_text = ' '.join(str(arg) for arg in cmd)
        else:
            cmd_text = str(cmd)
        return format_html('<small><code>{}</code></small>', cmd_text)

    def version(self, obj):
        """Render ``cmd_version``, or '-' when it is empty."""
        return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')

    def get_formset(self, request, obj=None, **kwargs):
        """Build the inline formset, pre-filling defaults for manual entries.

        When ``obj`` is given, the bookkeeping fields (timestamps, cmd, pwd,
        created_by, cmd_version) are additionally rendered as hidden inputs.
        """
        formset = super().get_formset(request, obj, **kwargs)
        snapshot = self.get_parent_object_from_request(request)
        base_fields = formset.form.base_fields

        # default values for new entries
        base_fields['status'].initial = 'succeeded'
        base_fields['start_ts'].initial = timezone.now()
        base_fields['end_ts'].initial = timezone.now()
        base_fields['cmd_version'].initial = '-'
        if snapshot is not None:
            # Without a resolvable parent there is no output dir to default to.
            base_fields['pwd'].initial = str(snapshot.output_dir)
        base_fields['created_by'].initial = request.user
        base_fields['cmd'].initial = '["-"]'
        base_fields['output_str'].initial = 'Manually recorded cmd output...'

        if obj is not None:
            # hidden values for existing entries and new entries
            for field_name in ('start_ts', 'end_ts', 'cmd', 'pwd', 'created_by', 'cmd_version'):
                base_fields[field_name].widget = base_fields[field_name].hidden_widget()

        return formset

    def get_readonly_fields(self, request, obj=None):
        """Return the read-only fields when ``obj`` is given, else none.

        NOTE(review): with no read-only fields, the computed columns listed in
        ``fields`` are no longer excluded from the form -- confirm this path
        works as intended.
        """
        if obj is not None:
            return self.readonly_fields
        return []
class ArchiveResultAdmin(BaseModelAdmin):
    """Admin changelist and change-form configuration for ArchiveResult."""

    list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
    sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
    readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
    search_fields = ('id', 'snapshot__url', 'extractor', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
    autocomplete_fields = ['snapshot']

    fieldsets = (
        ('Snapshot', {
            'fields': ('snapshot', 'snapshot_info', 'tags_str'),
            'classes': ('card', 'wide'),
        }),
        ('Extractor', {
            'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at', 'iface'),
            'classes': ('card',),
        }),
        ('Timing', {
            'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
            'classes': ('card',),
        }),
        ('Command', {
            'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
            'classes': ('card',),
        }),
        ('Output', {
            'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
            'classes': ('card', 'wide'),
        }),
        ('Metadata', {
            'fields': ('created_by',),
            'classes': ('card',),
        }),
    )

    list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
    ordering = ['-start_ts']
    list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE

    paginator = AccelleratedPaginator
    save_on_top = True

    actions = ['delete_selected']

    # NOTE(review): an inner Meta on a ModelAdmin presumably has no effect
    # (verbose names normally live on the model's Meta) -- confirm before
    # relying on it.
    class Meta:
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results'

    def change_view(self, request, object_id, form_url="", extra_context=None):
        """Remember the current request on the admin, then render the change view."""
        self.request = request
        return super().change_view(request, object_id, form_url, extra_context)

    @admin.display(
        description='Snapshot Info'
    )
    def snapshot_info(self, result):
        """Link to the snapshot's archive index, labelled with id, date and URL."""
        return format_html(
            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
            result.snapshot.timestamp,
            str(result.snapshot.id)[:8],
            result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
            result.snapshot.url[:128],
        )

    @admin.display(
        description='Snapshot Tags'
    )
    def tags_str(self, result):
        """Return the tags string of the result's snapshot."""
        return result.snapshot.tags_str()

    @admin.display(description='Extractor', ordering='extractor')
    def extractor_with_icon(self, result):
        """Render the extractor's icon followed by its name."""
        icon = get_extractor_icon(result.extractor)
        return format_html(
            '<span title="{}">{}</span> {}',
            result.extractor,
            icon,
            result.extractor,
        )

    def cmd_str(self, result):
        """Render the command as one preformatted, space-joined line."""
        return format_html(
            '<pre>{}</pre>',
            ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
        )

    def output_display(self, result):
        """Render a link to the result's output followed by its output string."""
        # Determine output link path - use embed_path() which checks output_files
        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
        output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
        return format_html(
            '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
            result.snapshot.timestamp,
            output_path,
            result.output_str,
        )

    def output_summary(self, result):
        """Render the output string plus a two-level listing of output files.

        The listing is rooted at the embed path when that is a readable
        directory, and at the snapshot directory otherwise.  Only the root and
        its immediate subdirectories are listed.
        """
        snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
        embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
        path_from_embed = snapshot_dir / (embed_path or '')

        parts = [
            format_html(
                '<pre style="display: inline-block">{}</pre><br/>',
                result.output_str,
            ),
            format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp)),
            format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path)),
        ]

        # os.walk() yields nothing for a regular file, so only root the
        # listing at the embed path when it is actually a directory.
        if path_from_embed.is_dir() and os.access(path_from_embed, os.R_OK):
            root_dir = str(path_from_embed)
        else:
            root_dir = str(snapshot_dir)

        for root, dirs, files in os.walk(root_dir):
            # root always starts with root_dir, so the remainder's separator
            # count gives the depth below the listing root (root itself is 1).
            depth = root[len(root_dir):].count(os.sep) + 1
            if depth >= 2:
                # Deeper levels are never displayed; prune them instead of
                # walking the whole tree.
                dirs[:] = []
            else:
                dirs.sort()  # deterministic order, matching the sorted files

            indent = ' ' * 4 * depth
            parts.append(format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root)))

            indentation_str = ' ' * 4 * (depth + 1)
            for filename in sorted(files):
                is_hidden = filename.startswith('.')
                # Yields opacity "1.2" for regular files and "0.2" (dimmed)
                # for dotfiles.
                parts.append(format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip()))

        # Every part is format_html() output, so the joined markup is safe.
        return mark_safe(''.join(parts) + '</code></pre>')
def register_admin(admin_site):
    """Register the ArchiveResult model with its admin class on ``admin_site``."""
    model, model_admin = ArchiveResult, ArchiveResultAdmin
    admin_site.register(model, model_admin)