# ArchiveBox/tests/test_list.py
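"""Integration tests for the `archivebox search` CLI output formats (JSON, HTML, CSV)."""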
import json
import re
import subprocess

from .fixtures import *
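

# The `archivebox search` CLI may emit ANSI color codes or other control
# characters alongside its JSON output, which json.loads() rejects. This is a
# small local helper (not part of the ArchiveBox API) that strips that noise
# before parsing, shared by the JSON tests below.
def parse_search_json(stdout_bytes):
    """Parse JSON from `archivebox search --json` output, tolerating ANSI noise."""
    output_str = stdout_bytes.decode("utf-8").strip()
    try:
        return json.loads(output_str)
    except json.JSONDecodeError:
        # Strip ANSI escape sequences first, then drop any remaining control
        # characters (tabs/newlines/carriage returns become spaces).
        clean_str = re.sub(r'\x1b\[[0-9;]*m', '', output_str)
        clean_str = re.sub(r'[\x00-\x1f\x7f]', lambda m: ' ' if m.group(0) in '\t\n\r' else '', clean_str)
        return json.loads(clean_str)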


def test_search_json(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--json"], capture_output=True)
    output_json = parse_search_json(search_process.stdout)
    # With --index-only, only source-file snapshots (file:// URLs) are created,
    # so there should be at least one snapshot in the results
    assert len(output_json) >= 1
    # The snapshot's URL should be a file:// path into the sources directory
    assert any("sources" in entry.get("url", "") for entry in output_json)


def test_search_json_headers(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--json", "--with-headers"], capture_output=True)
    output_json = parse_search_json(search_process.stdout)
    # In --with-headers mode the results are wrapped under a "links" key
    links = output_json.get("links", output_json)
    assert len(links) >= 1


def test_search_html(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--html"], capture_output=True)
    output_html = search_process.stdout.decode("utf-8")
    # Should contain some HTML and/or a reference to the source file
    assert "sources" in output_html or "cli_add" in output_html or "<" in output_html


def test_search_html_headers(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--html", "--with-headers"], capture_output=True)
    output_html = search_process.stdout.decode("utf-8")
    # Should contain HTML
    assert "<" in output_html


def test_search_csv(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url"], capture_output=True)
    output_csv = search_process.stdout.decode("utf-8")
    # Should contain the source file URL
    assert "file://" in output_csv or "sources" in output_csv


def test_search_csv_headers(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--with-headers"], capture_output=True)
    output_csv = search_process.stdout.decode("utf-8")
    # With --with-headers, the output should include the "url" column header
    assert "url" in output_csv


def test_search_with_headers_requires_format(process):
    search_process = subprocess.run(["archivebox", "search", "--with-headers"], capture_output=True)
    stderr = search_process.stderr.decode("utf-8")
    # --with-headers is only valid together with an output format flag
    assert "--with-headers" in stderr and ("requires" in stderr or "can only be used" in stderr)


def test_sort_by_url(process, disable_extractors_dict):
    # Add two URLs - they will create separate source files
    subprocess.run(["archivebox", "add", "--index-only", "https://iana.org", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    # Search with sort should return results (even if they're file:// URLs)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--sort=url"], capture_output=True)
    output = search_process.stdout.decode("utf-8")
    lines = [line for line in output.strip().split("\n") if line]
    # Should have at least 2 snapshots (the source-file snapshots)
    assert len(lines) >= 2