From cffbef84eddcfacbb702307ec181fc4ac01639ae Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 27 Dec 2025 00:33:51 -0800 Subject: [PATCH] make Claude.md stricter and improve migration tests --- CLAUDE.md | 16 +++--- archivebox/tests/test_migrations_08_to_09.py | 51 +++++++++++--------- archivebox/tests/test_migrations_helpers.py | 46 ++++++++++-------- 3 files changed, 63 insertions(+), 50 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index b8af1059..8dcc1e8b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -3,10 +3,10 @@ ## Quick Start ```bash -# Set up dev environment -uv sync --dev +# Set up dev environment (always use uv, never pip directly) +uv sync --dev --all-extras -# Run tests as non-root user (required - ArchiveBox refuses to run as root) +# Run tests as non-root user (required - ArchiveBox always refuses to run as root) sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/ -v' ``` @@ -19,7 +19,7 @@ sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archiveb ### Install Dependencies ```bash -uv sync --dev +uv sync --dev --all-extras # Always use uv, never pip directly ``` ### Activate Virtual Environment @@ -30,7 +30,7 @@ source .venv/bin/activate ## Running Tests ### CRITICAL: Never Run as Root -ArchiveBox has a root check that prevents running as root user. Always run tests as a non-root user: +ArchiveBox has a root check that prevents running as root user. All ArchiveBox commands (including tests) must run as non-root user inside a data directory: ```bash # Run all migration tests @@ -62,8 +62,10 @@ Tests must exercise real code paths: - Run actual `python -m archivebox` commands via subprocess - Query SQLite directly to verify results +**If something is hard to test**: Modify the implementation to make it easier to test, or fix the underlying issue. Never mock, skip, simulate, or exit early from a test because you can't get something working inside the test. + ### NO SKIPS -Never use `@skip`, `skipTest`, or `pytest.mark.skip`. Every test must run. +Never use `@skip`, `skipTest`, or `pytest.mark.skip`. Every test must run. If a test is difficult, fix the code or test environment - don't disable the test. ### Strict Assertions - `init` command must return exit code 0 (not `[0, 1]`) @@ -115,7 +117,7 @@ chmod 644 archivebox/tests/test_*.py ``` ### 2. DATA_DIR Environment Variable -Tests use temp directories. The `run_archivebox()` helper sets `DATA_DIR` automatically. +ArchiveBox commands must run inside a data directory. Tests use temp directories - the `run_archivebox()` helper sets `DATA_DIR` automatically. ### 3. Extractors Disabled for Speed Tests disable all extractors via environment variables for faster execution: diff --git a/archivebox/tests/test_migrations_08_to_09.py b/archivebox/tests/test_migrations_08_to_09.py index 09a1e65a..47d47cb5 100644 --- a/archivebox/tests/test_migrations_08_to_09.py +++ b/archivebox/tests/test_migrations_08_to_09.py @@ -12,6 +12,7 @@ Migration tests from 0.8.x to 0.9.x. import shutil import sqlite3 +import subprocess import tempfile import unittest from pathlib import Path @@ -440,28 +441,34 @@ class TestFilesystemMigration08to09(unittest.TestCase): result = run_archivebox(self.work_dir, ['init'], timeout=45) self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") - # Step 2: Archive example.com with some extractors enabled - # Enable a subset of fast extractors for testing - result = run_archivebox( - self.work_dir, - ['add', '--depth=0', 'https://example.com'], - timeout=120, - env={ - 'SAVE_TITLE': 'True', - 'SAVE_FAVICON': 'True', - 'SAVE_WGET': 'True', - 'SAVE_SCREENSHOT': 'False', # Disable slow extractors - 'SAVE_DOM': 'False', - 'SAVE_SINGLEFILE': 'False', - 'SAVE_READABILITY': 'False', - 'SAVE_MERCURY': 'False', - 'SAVE_PDF': 'False', - 'SAVE_MEDIA': 'False', - 'SAVE_ARCHIVE_DOT_ORG': 'False', - } - ) - # Note: Add may fail if network is down or extractors fail, but we still want to test - # the filesystem migration logic even with partial failures + # Step 2: Archive example.com with ALL extractors enabled + # This ensures we test migration with all file types + try: + result = run_archivebox( + self.work_dir, + ['add', '--depth=0', 'https://example.com'], + timeout=300, # 5 minutes for all extractors + env={ + 'SAVE_TITLE': 'True', + 'SAVE_FAVICON': 'True', + 'SAVE_WGET': 'True', + 'SAVE_SCREENSHOT': 'True', + 'SAVE_DOM': 'True', + 'SAVE_SINGLEFILE': 'True', + 'SAVE_READABILITY': 'True', + 'SAVE_MERCURY': 'True', + 'SAVE_PDF': 'True', + 'SAVE_MEDIA': 'True', + 'SAVE_ARCHIVE_DOT_ORG': 'True', + 'SAVE_HEADERS': 'True', + 'SAVE_HTMLTOTEXT': 'True', + 'SAVE_GIT': 'True', + } + ) + except subprocess.TimeoutExpired as e: + # If timeout, still continue - we want to test with whatever files were created + print(f"\n[!] Add command timed out after {e.timeout}s, continuing with partial results...") + # Note: Snapshot may still have been created even if command timed out # Step 3: Get the snapshot and verify files were created conn = sqlite3.connect(str(self.db_path)) diff --git a/archivebox/tests/test_migrations_helpers.py b/archivebox/tests/test_migrations_helpers.py index d2bf17aa..debaf5d1 100644 --- a/archivebox/tests/test_migrations_helpers.py +++ b/archivebox/tests/test_migrations_helpers.py @@ -986,27 +986,31 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]: # Helper Functions # ============================================================================= -def run_archivebox(data_dir: Path, args: list, timeout: int = 60) -> subprocess.CompletedProcess: +def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict = None) -> subprocess.CompletedProcess: """Run archivebox command in subprocess with given data directory.""" - env = os.environ.copy() - env['DATA_DIR'] = str(data_dir) - env['USE_COLOR'] = 'False' - env['SHOW_PROGRESS'] = 'False' - # Disable ALL extractors for faster tests - env['SAVE_ARCHIVE_DOT_ORG'] = 'False' - env['SAVE_TITLE'] = 'False' - env['SAVE_FAVICON'] = 'False' - env['SAVE_WGET'] = 'False' - env['SAVE_SINGLEFILE'] = 'False' - env['SAVE_SCREENSHOT'] = 'False' - env['SAVE_PDF'] = 'False' - env['SAVE_DOM'] = 'False' - env['SAVE_READABILITY'] = 'False' - env['SAVE_MERCURY'] = 'False' - env['SAVE_GIT'] = 'False' - env['SAVE_MEDIA'] = 'False' - env['SAVE_HEADERS'] = 'False' - env['SAVE_HTMLTOTEXT'] = 'False' + base_env = os.environ.copy() + base_env['DATA_DIR'] = str(data_dir) + base_env['USE_COLOR'] = 'False' + base_env['SHOW_PROGRESS'] = 'False' + # Disable ALL extractors for faster tests (can be overridden by env parameter) + base_env['SAVE_ARCHIVE_DOT_ORG'] = 'False' + base_env['SAVE_TITLE'] = 'False' + base_env['SAVE_FAVICON'] = 'False' + base_env['SAVE_WGET'] = 'False' + base_env['SAVE_SINGLEFILE'] = 'False' + base_env['SAVE_SCREENSHOT'] = 'False' + base_env['SAVE_PDF'] = 'False' + base_env['SAVE_DOM'] = 'False' + base_env['SAVE_READABILITY'] = 'False' + base_env['SAVE_MERCURY'] = 'False' + base_env['SAVE_GIT'] = 'False' + base_env['SAVE_MEDIA'] = 'False' + base_env['SAVE_HEADERS'] = 'False' + base_env['SAVE_HTMLTOTEXT'] = 'False' + + # Override with any custom env vars + if env: + base_env.update(env) cmd = [sys.executable, '-m', 'archivebox'] + args @@ -1014,7 +1018,7 @@ def run_archivebox(data_dir: Path, args: list, timeout: int = 60) -> subprocess. cmd, capture_output=True, text=True, - env=env, + env=base_env, cwd=str(data_dir), timeout=timeout, )