#!/usr/bin/env node --env-file .env
// https://gist.github.com/pirate/d9a350e83025a1e6cf452cddd815d0d4

// npm install request node-request minimist deepmerge mime-types decompress puppeteer-extra puppeteer-extra-plugin-repl puppeteer-extra-plugin-user-preferences puppeteer-extra-plugin-recaptcha puppeteer-extra-plugin-stealth puppeteer-screen-recorder puppeteer-cluster ghost-cursor @mozilla/readability jsdom unzip-crx-3 node-fetch@2


import assert from 'node:assert/strict';
import { Buffer } from 'node:buffer';
import child_process from 'node:child_process';
import crypto from 'node:crypto';
import fs from 'node:fs';
import { createServer } from 'node:http';
import os from 'node:os';
import path from 'node:path';
import querystring from 'node:querystring';
import { Readable } from 'node:stream';
import { finished } from 'node:stream/promises';
import { URL } from 'node:url';
import util from 'node:util';
const exec = util.promisify(child_process.exec);

import { Readability } from '@mozilla/readability';
import FileCookieStore from '@root/file-cookie-store';
import merge from 'deepmerge';
import { createCursor, getRandomPagePoint } from 'ghost-cursor';
import { JSDOM, VirtualConsole } from 'jsdom';
import mime from 'mime-types';
import ToughCookie from 'tough-cookie';
import unzip from 'unzip-crx-3';

import puppeteer from 'puppeteer';
import { Browser, Page, Cookie, HTTPResponse } from 'puppeteer';
import { Cluster } from 'puppeteer-cluster';
import PupeteerExtra from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import PrefsPlugin from 'puppeteer-extra-plugin-user-preferences';
|
||
import { PuppeteerScreenRecorder } from 'puppeteer-screen-recorder';
|
||
// import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
|
||
// import ReplPlugin from 'puppeteer-extra-plugin-repl';
|
||
|
||
const __dirname = import.meta.dirname
|
||
|
||
import { getDatabase } from './models/init-models.js';
|
||
const { Tag, Snapshot, ArchiveResult } = await getDatabase({ dbpath: './index.sqlite3' })
|
||
|
||
|
||
// move mitm CA cert into /usr/local/share/ca-certificates/mitmproxy-ca-cert.crt
|
||
// update-ca-certificates
|
||
|
||
|
||
const ANSI = {
|
||
reset: "\x1b[0m",
|
||
blue: "\x1b[34m",
|
||
black: "\x1b[30m",
|
||
}
|
||
|
||
/************************* Main Input Arguments *******************************/
|
||
let URLS = [
|
||
// 'chrome://about',
|
||
// 'chrome://system/#chrome_root_store',
|
||
|
||
'https://facebook.com/815781663692514/?comment_id=1508571679703640',
|
||
'https://www.instagram.com/p/CrTY1fENHr5/',
|
||
'https://www.tiktok.com/@zemmour_eric/video/7342474065598319904?cid=7343316616878490400',
|
||
'https://twitter.com/DZasken68678/status/1799833933271687304',
|
||
'https://t.me/IONONMIARRENDOGROUP/13598',
|
||
'https://www.youtube.com/watch?v=rpD0qgzlCms',
|
||
'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
|
||
|
||
|
||
'https://gologin.com/check-browser',
|
||
'https://arh.antoinevastel.com/bots/areyouheadless',
|
||
|
||
'https://2captcha.com/demo/hcaptcha',
|
||
'https://2captcha.com/demo/cloudflare-turnstile',
|
||
'https://2captcha.com/demo/recaptcha-v3',
|
||
'https://ipinfo.io/',
|
||
|
||
// 'https://2captcha.com/demo/recaptcha-v2',
|
||
// 'https://2captcha.com/demo/keycaptcha',
|
||
// 'https://browserleaks.com/canvas',
|
||
// 'https://bot.incolumitas.com/#botChallenge',
|
||
// 'https://infosimples.github.io/detect-headless/',
|
||
// 'https://coveryourtracks.eff.org/',
|
||
// 'https://fingerprint.com/demo/',
|
||
// 'https://nowsecure.nl',
|
||
// 'https://abrahamjuliot.github.io/creepjs/',
|
||
// 'https://scrapfly.io/web-scraping-tools/http2-fingerprint',
|
||
// 'https://scrapfly.io/web-scraping-tools/browser-fingerprint',
|
||
// 'https://scrapfly.io/web-scraping-tools/ja3-fingerprint',
|
||
// 'https://scrapfly.io/web-scraping-tools/canvas-fingerprint',
|
||
// 'https://scrapfly.io/web-scraping-tools/webgl-fingerprint',
|
||
// 'https://scrapfly.io/web-scraping-tools/audio-fingerprint',
|
||
// 'https://scrapfly.io/web-scraping-tools/screen-fingerprint',
|
||
// 'https://web-scraping.dev/',
|
||
|
||
|
||
// 'https://example.com',
|
||
// 'https://www.okta.com/',
|
||
// 'https://www.webflow.com/',
|
||
// 'https://docker-compose.archivebox.io',
|
||
// 'https://www.reddit.com/r/AskReddit/comments/1br0q9b/what_was_ok_10_years_ago_but_isnt_today/',
|
||
// 'https://www.quora.com/Is-the-website-2Captcha-true-or-fake-with-paying-money-for-working-on-it',
|
||
// 'https://x.com/yawnzzcalo7/status/1747853178849435894',
|
||
// 'https://twitter.com/yawnzzcalo7/status/1747853178849435894',
|
||
// 'https://rachdele.substack.com/p/is-the-job-market-dying',
|
||
// 'https://www.flowradar.com/cloneables/mouse-image-trail-effect',
|
||
// 'https://wrong.host.badssl.com/',
|
||
// 'http://docker-compose.archivebox.io',
|
||
// 'https://pptr.dev/api/puppeteer.page.setrequestinterception',
|
||
// 'https://blog.sweeting.me#Writing',
|
||
// 'https://github.com/yarnpkg/yarn/issues/9005',
|
||
|
||
// 'https://archive.md/739Oc',
|
||
// 'https://archive.md/Oc72d',
|
||
// 'https://archive.vn/fPUBe',
|
||
// 'https://archive.vn/mRz4P',
|
||
// 'https://archive.vn/Qct6Y',
|
||
// 'https://archive.vn/sv50h',
|
||
// 'https://facebook.com/815781663692514/?comment_id=1508571679703640',
|
||
// 'https://facebook.com/815781663692514/?comment_id=924451748966499',
|
||
// 'https://www.facebook.com/wayne.brennan.528/posts/pfbid02fvxFppng2WsHMavhBa62cXizCBGdmPQRH3CMhac79qzS5C1ADaSNC587d3u6qVbkl',
|
||
// 'https://www.facebook.com/wildeprods/posts/pfbid02YEPfoB7pZqMNzE4y2MpYSQbRAzASquvHyEMzHqrNngJCSL7onEg2jnsqS6epcQHWl',
|
||
// 'https://t.me/aubontouite_francais/9493',
|
||
// 'https://t.me/BC_BLACKMIROR/5044',
|
||
// 'https://t.me/IONONMIARRENDOGROUP/14004',
|
||
// 'https://t.me/newsfactory_pl/51014',
|
||
// 'https://t.me/oliverjanich/132574',
|
||
// 'https://t.me/tomaszgryguc/10449',
|
||
// 'https://t.me/amigosDisidentes/123177',
|
||
// 'https://twitter.com/1nfiltr4do_NN/status/1767238399943991389',
|
||
// 'https://twitter.com/4lmondcookie/status/1748519205438111914',
|
||
// 'https://twitter.com/4olll1ke/status/1753796944827199766',
|
||
// 'https://twitter.com/yeokiloss/status/1754908226179502345',
|
||
// 'https://twitter.com/YoungWaifLover/status/1735667278090297561',
|
||
// 'https://twitter.com/Z_Pour_Demain/status/1766133730278605182',
|
||
// 'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
|
||
// 'https://www.aap.com.au/factcheck/absurd-albanese-clip-fools-voice-voters/',
|
||
// 'https://www.instagram.com/_the.forgotten.ones/p/CQQDyoqhsF6/',
|
||
// 'https://www.instagram.com/p/CqSM_f9MR4b/',
|
||
// 'https://www.instagram.com/p/CqSQgf1sv8B/',
|
||
// 'https://instagram.com/p/B-Q22Z_pxyC/',
|
||
// 'https://www.tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
|
||
// 'https://tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
|
||
// 'https://www.youtube.com/watch?v=rpD0qgzlCms',
|
||
]
|
||
|
||
const isTruthy = (env_value) => ['1', 'yes', 'true'].includes(env_value?.toLowerCase() || 'false')
|
||
|
||
/********************** Config: General High-Level Options ********************/
|
||
|
||
const PASSIVE_ARCHIVING = isTruthy(process.env.PASSIVE_ARCHIVING)
|
||
const CHROME_CLUSTER = isTruthy(process.env.CHROME_CLUSTER)
|
||
const CHROME_CLUSTER_WORKERS = 4
|
||
|
||
const API_SERVER_HOST = '0.0.0.0'
|
||
const API_SERVER_PORT = 9595
|
||
const CHROME_DEBUG_PORT = 9222 // 9222 is default, or use 0 for random port
|
||
|
||
/********************** Config: Keys & Secrets ********************************/
|
||
|
||
const API_KEY_2CAPTCHA = process.env.API_KEY_2CAPTCHA || 'YOUR_API_KEY_HERE'
|
||
const FLARESOLVERR_API_ENDPOINT = process.env.FLARESOLVERR_API_ENDPOINT || "http://localhost:8191/v1"
|
||
|
||
const ACTIVE_PERSONA = process.env.ACTIVE_PERSONA || 'Default'
|
||
const CHROME_PROFILE_USER = process.env.CHROME_PROFILE_USER || 'Default'
|
||
const LOAD_AUTH_STORAGE = isTruthy(process.env.LOAD_AUTH_STORAGE)
|
||
const SAVE_AUTH_STORAGE = isTruthy(process.env.SAVE_AUTH_STORAGE)
|
||
|
||
/********************** Config: Data Dir Locations ****************************/
|
||
|
||
const SRC_DIR = path.resolve(__dirname)
|
||
const DATA_DIR = process.env.DATA_DIR || await fs.promises.realpath(path.join(SRC_DIR, 'data'))
|
||
const INDEXES_DIR = path.join(DATA_DIR, 'index')
|
||
const ARCHIVE_DIR = path.join(DATA_DIR, 'archive')
|
||
if (!fs.existsSync(ARCHIVE_DIR))
|
||
throw 'Could not find data/archive, are you running in the right pwd?'
|
||
|
||
const PERSONA_DIR = path.join(DATA_DIR, 'personas', ACTIVE_PERSONA)
|
||
const CHROME_PROFILE_PATH = path.join(PERSONA_DIR, 'chrome_profile')
|
||
const CHROME_DOWNLOADS_DIR = path.join(PERSONA_DIR, 'chrome_downloads')
|
||
const CHROME_EXTENSIONS_DIR = path.join(PERSONA_DIR, 'chrome_extensions')
|
||
const CHROME_EXTENSIONS_JSON_PATH = path.join(CHROME_EXTENSIONS_DIR, 'extensions.json')
|
||
const AUTH_JSON_PATH = path.join(PERSONA_DIR, 'auth.json')
|
||
const COOKIES_TXT_PATH = path.join(PERSONA_DIR, 'cookies.txt')
|
||
const SPEEDTESTS_DIR = path.join(PERSONA_DIR, 'speedtests')
|
||
// const CHROME_PROFILE_IMPORT_USER = 'Profile 1'
|
||
// const CHROME_PROFILE_IMPORT_PATH = '/Volumes/NVME/Users/squash/Library/Application Support/Google/Chrome'
|
||
|
||
// chrome profile / persona directories
|
||
fs.mkdirSync(PERSONA_DIR, {recursive: true})
|
||
fs.mkdirSync(SPEEDTESTS_DIR, {recursive: true})
|
||
fs.mkdirSync(CHROME_PROFILE_PATH, {recursive: true})
|
||
fs.mkdirSync(CHROME_EXTENSIONS_DIR, {recursive: true})
|
||
fs.mkdirSync(CHROME_DOWNLOADS_DIR, {recursive: true})
|
||
|
||
// cruft directories
|
||
const ORPHANS_DIR = path.join(DATA_DIR, 'orphans')
|
||
const PARTIALS_DIR = path.join(DATA_DIR, 'partials')
|
||
const DUPLICATES_DIR = path.join(DATA_DIR, 'duplicates')
|
||
await fs.promises.mkdir(ORPHANS_DIR, {recursive: true})
|
||
await fs.promises.mkdir(PARTIALS_DIR, {recursive: true})
|
||
await fs.promises.mkdir(DUPLICATES_DIR, {recursive: true})
|
||
|
||
/********************** Config: Viewport Setup Opts ***************************/
|
||
|
||
// Config: Viewport
|
||
const DEFAULT_TIMEOUT = 20_000
|
||
const DEFAULT_GEOLOCATION = {latitude: 59.95, longitude: 30.31667}
|
||
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
|
||
const DEFAULT_ASPECT_RAIO = 16/9 // recommended: 16:9 (most common desktop window aspect ratio)
|
||
const SCREENSHOT_ASPECT_RATIO = 4/3 // recommended: 4:3 (easier to use as thumbnails when square-ish)
|
||
const DEFAULT_WINDOW_WIDTH = 1920 // recommended: 1920x1080p (1080p screenshots)
|
||
const DEFAULT_WINDOW_HEIGHT = Math.floor(DEFAULT_WINDOW_WIDTH/DEFAULT_ASPECT_RAIO)
|
||
const DEFAULT_VIEWPORT = {
|
||
width: DEFAULT_WINDOW_WIDTH,
|
||
height: DEFAULT_WINDOW_HEIGHT,
|
||
deviceScaleFactor: 2, // 2 gives much sharper text in screenshots/pdfs/etc but uses more CPU/GPU
|
||
isMobile: false,
|
||
hasTouch: false,
|
||
isLandscape: false,
|
||
}
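// With the defaults above: 1920 / (16/9) = 1080, so the browser window is 1920x1080,
// and deviceScaleFactor: 2 means screenshots come out at 3840x2160 physical pixels.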
|
||
const DEFAULT_COLOR_SCHEME = 'light'
|
||
const DEFAULT_HEADERS = {
|
||
// requires frequent tweaking to remain undetected by cloudflare/recaptcha/etc.
|
||
// 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||
// 'accept-encoding': 'gzip, deflate, br, zstd',
|
||
// 'accept-language': accept_language,
|
||
// 'cache-Control': no_cache ? 'no-cache' : '',
|
||
// 'dnt': '1',
|
||
'sec-ch-ua': '"Google Chrome";v="122", "Not:A-Brand";v="8", "Chromium";v="122"',
|
||
'sec-ch-ua-mobile': '?0',
|
||
'sec-ch-ua-platform': '"macOS"',
|
||
'connection-rtt': '50',
|
||
// 'pragma': no_cache ? 'no-cache' : '',
|
||
// 'sec-fetch-dest': 'document',
|
||
// 'sec-fetch-mode': 'navigate',
|
||
// 'sec-fetch-site': 'none',
|
||
// 'sec-fetch-user': '?1',
|
||
// // 'upgrade-insecure-requests': '1', // breaks some sites, e.g. https://www.flowradar.com/cloneables/mouse-image-trail-effect
|
||
// 'user-agent': user_agent,
|
||
}
|
||
|
||
const DEFAULT_REFERRERS = ["https://www.google.com", "https://www.facebook.com", "https://www.instagram.com"]
|
||
|
||
/****************** Config: Human Behavior Emulation **************************/
|
||
|
||
const SCROLL_LIMIT = 20; // e.g. 30 = 30 * (1000px/2s) => 30,000px scrolled in 60sec
|
||
const SCROLL_DELAY = 1350; // interval per scroll, e.g. 2000 = 2sec to travel 1 * SCROLL_DISTANCE
|
||
const SCROLL_DISTANCE = DEFAULT_VIEWPORT.height - 100; // make sure this is slightly less than viewport height so there is some overlap to make stitching easier
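// With the values above: 20 scrolls of ~980px every 1.35s covers roughly 19,600px of page in ~27s.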
|
||
|
||
/********************** Config: URL Rewriting *********************************/
|
||
const URL_REWRITES = [
|
||
// replacements should come first
|
||
// {
|
||
// idx: 0,
|
||
// pattern: /\/\/(www\.)?x\.com/gi,
|
||
// replacement: '//$1twitter.com/',
|
||
// // TODO: scope: 'hostname',
|
||
// },
|
||
// {
|
||
// idx: 1,
|
||
// pattern: /\/\/(www\.)?twitter\.com/gi,
|
||
// replacement: '//$1nitter.net',
|
||
// // TODO: scope: 'hostname',
|
||
// },
|
||
|
||
// // blocks should come at the end
|
||
// {
|
||
// idx: 999,
|
||
// pattern: /\/\/(www\.)?notallowed\.com/gi,
|
||
// replacement: '',
|
||
// // TODO: scope: 'href',
|
||
// },
|
||
]
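// Illustrative sketch (assumption, not part of the original config): each rule is a regex +
// replacement applied to the URL string in idx order, with an empty replacement acting as a block:
//   const applyUrlRewrites = (url) =>
//       [...URL_REWRITES].sort((a, b) => a.idx - b.idx)
//           .reduce((u, {pattern, replacement}) => u.replace(pattern, replacement), url)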
|
||
const URL_SCHEMES_IGNORED = [
|
||
'', // no scheme is also invalid (e.g. opening a new tab page without any url yet)
|
||
'chrome',
|
||
'chrome-extension',
|
||
'chrome-untrusted',
|
||
'file',
|
||
'data',
|
||
'about',
|
||
]
|
||
|
||
|
||
/**************** Load existing data/archive/<timestamp> snapshots *************/
|
||
|
||
const snapshots = await Snapshot.findAll({ attributes: ['id', 'timestamp', 'url'] }) // include: { model: ArchiveResult, as: 'archiveresults' }, });
|
||
const results = await ArchiveResult.findAll({ attributes: ['id', 'snapshot_id', 'extractor', 'start_ts'] }) // include: { model: Snapshot, as: 'snapshot' }, });
|
||
globalThis.snapshots = snapshots
|
||
globalThis.results = results
|
||
console.log(`[💿] Found ${snapshots.length} existing snapshots in index.sqlite3...`)
|
||
console.log(`[💿] Found ${results.length} existing results in index.sqlite3...`)
|
||
// debugger;
|
||
|
||
const locateExistingSnapshots = (archive_dir) => {
|
||
const urls_to_dirs = {}
|
||
// for each data/archive/<timestamp>/index.json found, store {url: data/archive/<timestamp>}
|
||
for (const snapshot_dir of fs.readdirSync(archive_dir)) {
|
||
const snapshot_json = path.join(archive_dir, snapshot_dir, 'index.json')
|
||
if (fs.existsSync(snapshot_json)) {
|
||
const {url, archive_path} = JSON.parse(fs.readFileSync(snapshot_json, 'utf-8'))
|
||
if (!snapshot_dir.includes(archive_path.replace('archive/', '')))
|
||
throw 'Found incorrect index.json inside snapshot dir: ' + snapshot_dir
|
||
if (url && url.includes('://')) {
|
||
urls_to_dirs[url] = path.join(archive_dir, snapshot_dir)
|
||
}
|
||
}
|
||
}
|
||
return urls_to_dirs
|
||
}
|
||
|
||
let SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
|
||
|
||
let all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
|
||
// const orphan_snap_dirs = all_snap_dirs.filter(dirname => dirname.startsWith('19999'))
|
||
|
||
// // scan through existing snapshot dirs, move orphans to orphans/ or correct archive/<snapid>
|
||
// for (const snap_id of orphan_snap_dirs) {
|
||
// if (snap_id.startsWith('.')) continue
|
||
// const src_dir = path.join(ARCHIVE_DIR, snap_id)
|
||
// let src_path = src_dir
|
||
|
||
// assert((await fs.promises.stat(src_dir)).isDirectory())
|
||
// let dest_path = null
|
||
|
||
// const orphan_metrics_path = path.join(src_dir, 'metrics.json')
|
||
// if (fs.existsSync(orphan_metrics_path)) {
|
||
// const orphan_metrics = JSON.parse(await fs.promises.readFile(orphan_metrics_path, 'utf-8'))
|
||
// const url = orphan_metrics.url || orphan_metrics.URL
|
||
// const version = orphan_metrics.VERSION || versionStrFromDate(orphan_metrics.start_time)
|
||
|
||
// // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
|
||
// await symlinkBestSnapshotResults(src_dir)
|
||
|
||
// dest_path = SNAPSHOT_DIRS_BY_URL[url]
|
||
// const dest_id = dest_path?.split('/').at(-1)
|
||
|
||
// if (dest_id && (dest_id != snap_id)) {
|
||
// if (fs.existsSync(dest_path)) {
|
||
// console.log(` - moving duplicate snap_dir ${src_dir} -> ${dest_path}`)
|
||
// } else {
|
||
// console.log(` - moving valid snap_dir ${src_dir} -> ${dest_path}`)
|
||
// }
|
||
// } else if (dest_id == snap_id) {
|
||
// continue
|
||
// } else {
|
||
// dest_path = path.join(ORPHANS_DIR, snap_id)
|
||
// console.log(` - moving orphan snap_dir ${src_dir} -> ${dest_path}`)
|
||
// }
|
||
// } else {
|
||
// // corrupt/par
|
||
// dest_path = path.join(PARTIALS_DIR, snap_id)
|
||
// console.log(` - moving partial snap_dir ${src_dir} -> ${dest_path}`)
|
||
// }
|
||
// if (dest_path) {
|
||
// for (const version_dir of (await fs.promises.readdir(path.join(src_path, 'versions')))) {
|
||
// const version_src = path.join(src_path, 'versions', version_dir)
|
||
// const version_dst = path.join(dest_path, 'versions', version_dir)
|
||
|
||
// // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
|
||
// await symlinkBestSnapshotResults(dest_path)
|
||
|
||
// assert(!fs.existsSync(version_dst))
|
||
// await fs.promises.rename(version_src, version_dst)
|
||
// console.log(' - ', version_src, '--->', version_dst)
|
||
// }
|
||
// await fs.promises.rename(src_dir, path.join(PARTIALS_DIR, snap_id))
|
||
// await symlinkBestSnapshotResults(dest_path)
|
||
// }
|
||
// }
|
||
|
||
// const duplicate_snap_dirs = (await fs.promises.readdir(DUPLICATES_DIR)).filter(dirname => dirname.startsWith('19999'))
|
||
// for (const snap_id of duplicate_snap_dirs) {
|
||
// const src_dir = path.join(DUPLICATES_DIR, snap_id)
|
||
// const metrics = JSON.parse(await fs.promises.readFile(path.join(src_dir, 'metrics.json'), 'utf-8'))
|
||
// }
|
||
|
||
// all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
|
||
// for (const snap_id of all_snap_dirs) {
|
||
// if (snap_id.startsWith('.')) continue
|
||
// const snap_dir = path.join(ARCHIVE_DIR, snap_id)
|
||
// const metrics_path = path.join(snap_dir, 'metrics.json')
|
||
// if (fs.existsSync(metrics_path)) {
|
||
// // console.log(' - updating snap_dir', snap_dir)
|
||
// await symlinkBestSnapshotResults(snap_dir)
|
||
// }
|
||
// }
|
||
// SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
|
||
|
||
|
||
fs.writeFileSync(path.join(DATA_DIR, 'queue.csv'), '')
|
||
|
||
const snapIdFromDir = (dir_path) =>
|
||
dir_path.split('/archive/').at(-1)
|
||
|
||
const snapshot_dir_list = (
|
||
Object.entries(SNAPSHOT_DIRS_BY_URL)
|
||
.sort(([_ak, a], [_bk, b]) =>
|
||
Number(snapIdFromDir(b)) - Number(snapIdFromDir(a)))
|
||
.reverse())
|
||
|
||
for (const [existing_url, snapshot_dir] of snapshot_dir_list) {
|
||
// if (existing_url.startsWith('https://www.facebook.com/')) {
|
||
const is_desired_url = !(existing_url.includes('facebook.com/') || existing_url.includes('instagram.com/'))
|
||
const already_archived = false // fs.existsSync(path.join(SNAPSHOT_DIRS_BY_URL[existing_url], 'versions'))
|
||
if (is_desired_url && !already_archived) {
|
||
// URLS.push(existing_url)
|
||
fs.appendFileSync(
|
||
path.join(DATA_DIR, 'queue.csv'),
|
||
`${SNAPSHOT_DIRS_BY_URL[existing_url]},${existing_url}\n`,
|
||
'utf-8',
|
||
)
|
||
}
|
||
}
|
||
URLS = [...new Set(URLS)]
|
||
console.log('[+] Added', URLS.length, 'existing urls to queue...')
|
||
|
||
|
||
/********************** Config: Output Paths **********************************/
|
||
// const TASK_PATH = (url) => path.join(DATA_DIR, 'results', `${hashCode(url)}`)
|
||
const TASK_PATH = (url) => SNAPSHOT_DIRS_BY_URL[url] || path.join(ARCHIVE_DIR, `1999999999.${hashCode(url)}`)
|
||
// const TASK_PATH = (url) => {
|
||
// const existing_snap_dir = SNAPSHOT_DIRS_BY_URL[url]
|
||
// assert(existing_snap_dir, `Could not find existing snapshot dir for ${url}`)
|
||
// return existing_snap_dir
|
||
// }
|
||
|
||
const OUTPUT_PATH = (page, filename, extname='') =>
|
||
path.join(TASK_PATH(page._original_url), `${filename}${extname}`)
|
||
|
||
const SSL_PATH = (page) => OUTPUT_PATH(page, 'ssl.json')
|
||
const CONSOLELOG_PATH = (page) => OUTPUT_PATH(page, 'console.log')
|
||
const HEADERS_PATH = (page) => OUTPUT_PATH(page, 'headers.json')
|
||
const REDIRECTS_PATH = (page) => OUTPUT_PATH(page, 'redirects.json')
|
||
const REQUESTS_PATH = (page) => OUTPUT_PATH(page, 'requests.json')
|
||
const TRACE_PATH = (page) => OUTPUT_PATH(page, 'trace.json')
|
||
const METRICS_PATH = (page) => OUTPUT_PATH(page, 'metrics.json')
|
||
const OUTLINKS_PATH = (page) => OUTPUT_PATH(page, 'outlinks.json')
|
||
const SEO_PATH = (page) => OUTPUT_PATH(page, 'seo.json')
|
||
const FAVICON_PATH = (page) => OUTPUT_PATH(page, 'favicon.json')
|
||
const TITLE_PATH = (page) => OUTPUT_PATH(page, 'title.txt')
|
||
const BODYTEXT_PATH = (page) => OUTPUT_PATH(page, 'body.txt')
|
||
const PANDOC_PATH = (page) => OUTPUT_PATH(page, 'pandoc.md')
|
||
const READABILITY_PATH = (page) => OUTPUT_PATH(page, 'readability.json')
|
||
const ACCESIBILITY_PATH = (page) => OUTPUT_PATH(page, 'accessibility.json')
|
||
const DOM_PATH = (page) => OUTPUT_PATH(page, 'dom.html')
|
||
const PDF_PATH = (page) => OUTPUT_PATH(page, 'output.pdf')
|
||
const SCREENSHOT_PATH = (page) => OUTPUT_PATH(page, 'screenshot.png')
|
||
const SCREENSHOT_JPG_PATH = (page) => OUTPUT_PATH(page, 'screenshot.jpg')
|
||
const AIQA_PATH = (page) => OUTPUT_PATH(page, 'aiqa.json')
|
||
const SINGLEFILE_PATH = (page) => OUTPUT_PATH(page, 'singlefile.html')
|
||
const YTDLP_PATH = (page) => OUTPUT_PATH(page, 'media/')
|
||
const GALLERYDL_PATH = (page) => OUTPUT_PATH(page, 'photos/')
|
||
const SCREENRECORDING_PATH = (page) => OUTPUT_PATH(page, 'screenrecording.mp4')
|
||
const SCREENRECORDGIF_PATH = (page) => OUTPUT_PATH(page, 'screenrecording.gif')
|
||
const RESPONSES_PATH = (page) => OUTPUT_PATH(page, 'responses')
|
||
const RAW_PATH = (page) => OUTPUT_PATH(page, 'raw')
|
||
|
||
|
||
|
||
/********************** Config: Chrome Extensions *****************************/
|
||
|
||
interface ChromeExtension {
|
||
name: string
|
||
webstore_id: string
|
||
}
|
||
interface LoadedChromeExtension extends ChromeExtension {
|
||
id?: string
|
||
webstore_url?: string
|
||
crx_url?: string
|
||
crx_path?: string
|
||
unpacked_path?: string
|
||
read_manifest?: () => any
|
||
read_version?: () => string | null
|
||
}
|
||
|
||
const CHROME_EXTENSIONS: LoadedChromeExtension[] = [
|
||
// Content access / unblocking / blocking plugins
|
||
{webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'captcha2'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
|
||
{webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'},
|
||
{webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'},
|
||
// {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'},
|
||
// {webstore_id: 'mnjggcdmjocbbbhaepdhchncahnbgone', name: 'sponsorblock'},
|
||
// {webstore_id: 'iplffkdpngmdjhlpjmppncnlhomiipha', name: 'unpaywall'},
|
||
// {webstore_id: 'gofocbepaccnkpphbgjpolififgcakhn', name: 'spaywallnews'},
|
||
|
||
// Archiving plugins
|
||
{webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', name: 'singlefile'},
|
||
// {webstore_id: 'fpeoodllldobpkbkabpblcfaogecpndd', name: 'archivewebpage'},
|
||
// {webstore_id: 'niloccemoadcdkdjlinkgdfekeahmflj', name: 'pocket'},
|
||
// {webstore_id: 'kenncghfghgolcbmckhiljgaabnpcaaa', name: 'warcreate'},
|
||
// {webstore_id: 'jjndjgheafjngoipoacpjgeicjeomjli', name: 'puppeteerstream'},
|
||
|
||
// Utilities for humans setting up/viewing/debugging the archiving session
|
||
// {webstore_id: 'aeblfdkhhhdcdjpifhhbdiojplfjncoa', name: '1password'},
|
||
// {webstore_id: 'fngmhnnpilhplaeedifhccceomclgfbg', name: 'editthiscookie'},
|
||
// {webstore_id: 'cgfpgnepljlgenjclbekbjdlgcodfmjp', name: 'simpletabsorter'},
|
||
|
||
// Scripting/automation plugins
|
||
// {webstore_id: 'jinjaccalgkegednnccohejagnlnfdag', name: 'violentmonkey'},
|
||
// {webstore_id: 'infppggnoaenmfagbfknfkancpbljcca', name: 'automa'},
|
||
// {webstore_id: 'pfegffhjcgkneoemnlniggnhkfioidjg', name: 'screenscraper'},
|
||
]
|
||
|
||
/******************** Config: Chrome Profile Preferences **********************/
|
||
|
||
// https://niek.github.io/chrome-features/
|
||
const CHROME_DISABLED_COMPONENTS = [
|
||
'Translate',
|
||
'AcceptCHFrame',
|
||
'OptimizationHints',
|
||
'ProcessPerSiteUpToMainFrameThreshold',
|
||
'InterestFeedContentSuggestions',
|
||
'CalculateNativeWinOcclusion',
|
||
'BackForwardCache',
|
||
'HeavyAdPrivacyMitigations',
|
||
'LazyFrameLoading',
|
||
'ImprovedCookieControls',
|
||
'PrivacySandboxSettings4',
|
||
'AutofillServerCommunication',
|
||
'CertificateTransparencyComponentUpdater',
|
||
'DestroyProfileOnBrowserClose',
|
||
'CrashReporting',
|
||
'OverscrollHistoryNavigation',
|
||
'InfiniteSessionRestore',
|
||
//'LockProfileCookieDatabase', // disabling allows multiple chrome instances to concurrently modify profile, but might make chrome much slower https://github.com/yt-dlp/yt-dlp/issues/7271 https://issues.chromium.org/issues/40901624
|
||
]
|
||
|
||
const CHROME_PREFERENCES_EXTRA = {}
|
||
const CHROME_PREFERENCES_DEFAULT = {
|
||
// https://chromium.googlesource.com/chromium/src/+/32352ad08ee673a4d43e8593ce988b224f6482d3/chrome/common/pref_names.cc
|
||
homepage: 'about:blank', // doesn't work here, managed by Secure Preferences
|
||
homepage_is_newtabpage: false, // doesn't work here, managed by Secure Preferences
|
||
session: { // doesn't work here, managed by Secure Preferences
|
||
restore_on_startup: 4, // doesn't work here, managed by Secure Preferences
|
||
startup_urls: 'about:blank', // doesn't work here, managed by Secure Preferences
|
||
},
|
||
default_apps: 'noinstall',
|
||
browser: {
|
||
confirm_to_quit: false,
|
||
enable_spellchecking: false,
|
||
check_default_browser: false,
|
||
show_update_promotion_info_bar: false,
|
||
},
|
||
profile: {
|
||
// name: 'ArchiveBox Persona: Default', // doesnt work to change display name, not sure why
|
||
// using_default_name: false,
|
||
exited_cleanly: true,
|
||
default_content_setting_values: {
|
||
automatic_downloads: 1,
|
||
},
|
||
},
|
||
bookmark_bar: {show_on_all_tabs: false},
|
||
safebrowsing: {enabled: false},
|
||
search: {suggest_enabled: false},
|
||
download: {
|
||
prompt_for_download: false,
|
||
open_pdf_in_system_reader: true,
|
||
// default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
|
||
},
|
||
select_file_dialogs: {allowed: false},
|
||
autofill: {save_data: false},
|
||
printing: {enabled: false},
|
||
message_center: {welcome_notification_dismissed_local: true},
|
||
extensions: {
|
||
ui: {
|
||
developer_mode: true,
|
||
dismissed_adt_promo: true,
|
||
},
|
||
// pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
|
||
},
|
||
webkit: {
|
||
webprefs: {
|
||
javascript_enabled: true,
|
||
minimum_font_size: 9,
|
||
// default_font_size: 12,
|
||
// web_security_enabled: false,
|
||
// allow_displaying_insecure_content: true,
|
||
// allow_running_insecure_content: true,
|
||
java_enabled: true,
|
||
loads_images_automatically: true,
|
||
},
|
||
},
|
||
settings: {
|
||
multi_profile_never_show_intro: true,
|
||
multi_profile_warning_show_dismissed: true,
|
||
first_run_tutorial_shown: true,
|
||
},
|
||
plugins: {
|
||
always_open_pdf_externally: true,
|
||
},
|
||
}
|
||
|
||
const CHROME_PREFERENCES_PATH = path.join(CHROME_PROFILE_PATH, 'Default', 'Preferences')
|
||
|
||
const getChromePreferences = ({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_EXTENSIONS, CHROME_DOWNLOADS_DIR}) =>
|
||
merge.all([CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, {
|
||
extensions: {
|
||
pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
|
||
},
|
||
download: {
|
||
default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
|
||
},
|
||
}])
|
||
|
||
function applyChromePreferences(puppeteer, prefs_path, preferences) {
|
||
if (fs.existsSync(prefs_path)) {
|
||
const preferences_existing = JSON.parse(fs.readFileSync(prefs_path, 'utf-8'))
|
||
const preferences_merged = merge(preferences_existing, preferences)
|
||
// console.log(JSON.stringify(preferences_merged, null, 4))
|
||
fs.writeFileSync(prefs_path, JSON.stringify(preferences_merged))
|
||
} else {
|
||
// otherwise profile has not been created yet, use plugin instead (plugin only works on first creation)
|
||
puppeteer.use(PrefsPlugin({userPrefs: preferences}))
|
||
}
|
||
return puppeteer
|
||
}
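// Usage sketch (assumption -- shows how the two preference helpers above compose at launch time):
//   applyChromePreferences(
//       PupeteerExtra,
//       CHROME_PREFERENCES_PATH,
//       getChromePreferences({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_EXTENSIONS, CHROME_DOWNLOADS_DIR}),
//   )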
|
||
|
||
|
||
/******************** Config: Chrome Launch Args ******************************/
|
||
|
||
const CHROME_ARGS_DEFAULT = [
|
||
// Headless behavior tuning, deterministic behavior settings
|
||
// '--headless=new',
|
||
'--test-type',
|
||
'--test-type=gpu', // https://github.com/puppeteer/puppeteer/issues/10516
|
||
'--deterministic-mode',
|
||
'--js-flags=--random-seed=1157259159', // make all JS random numbers deterministic by providing a seed
|
||
'--allow-pre-commit-input', // allow JS mutations before page rendering is complete
|
||
'--disable-blink-features=AutomationControlled', // hide the signatures that announce browser is being remote-controlled
|
||
// '--enable-automation', // <- DONT USE THIS, it makes you easily detectable / blocked by cloudflare
|
||
// `--proxy-server=https://43.159.28.126:2334:u7ce652b7568805c4-zone-custom-region-us-session-szGWq3FRU-sessTime-60:u7ce652b7568805c4`, // send all network traffic through a proxy https://2captcha.com/proxy
|
||
// `--proxy-bypass-list=127.0.0.1`,
|
||
|
||
// Docker-specific options
|
||
// https://github.com/GoogleChrome/lighthouse-ci/tree/main/docs/recipes/docker-client#--no-sandbox-issues-explained
|
||
// '--no-sandbox', // rely on docker sandboxing in docker, otherwise we need cap_add: SYS_ADM to use host sandboxing
|
||
// '--disable-gpu-sandbox',
|
||
// '--disable-setuid-sandbox',
|
||
// '--disable-dev-shm-usage', // docker 75mb default shm size is not big enough, disabling just uses /tmp instead
|
||
// '--no-xshm',
|
||
|
||
// Profile data dir setup
|
||
// chrome://profile-internals
|
||
`--user-data-dir=${CHROME_PROFILE_PATH}`,
|
||
`--profile-directory=${CHROME_PROFILE_USER}`,
|
||
'--password-store=basic', // use mock keychain instead of OS-provided keychain (we manage auth.json instead)
|
||
'--use-mock-keychain',
|
||
'--disable-cookie-encryption', // we need to be able to write unencrypted cookies to save/load auth.json
|
||
// '--disable-sync', // don't try to use Google account sync features
|
||
|
||
// Extensions
|
||
// chrome://inspect/#extensions
|
||
// `--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`, // not needed when using existing profile that already has extensions installed
|
||
`--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({ webstore_id }) => webstore_id).join(',')}`,
|
||
'--allow-legacy-extension-manifests',
|
||
|
||
// Browser window and viewport setup
|
||
// chrome://version
|
||
// `--user-agent="${DEFAULT_USER_AGENT}"`,
|
||
// `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
|
||
'--window-position=0,0',
|
||
'--hide-scrollbars', // hide scrollbars because otherwise they show up in screenshots
|
||
'--install-autogenerated-theme=169,32,85', // red border makes it easier to see which chrome window is archivebox's
|
||
'--virtual-time-budget=60000', // fast-forward all animations & timers by 60s
|
||
'--autoplay-policy=no-user-gesture-required', // auto-start videos so they trigger network requests + show up in outputs
|
||
'--disable-gesture-requirement-for-media-playback',
|
||
'--lang=en-US,en;q=0.9',
|
||
|
||
// DANGER: JS isolation security features (to allow easier tampering with pages during archiving)
|
||
// chrome://net-internals
|
||
// '--disable-web-security', // <- WARNING, breaks some sites that expect/enforce strict CORS headers (try webflow.com)
|
||
// '--disable-features=IsolateOrigins,site-per-process', // useful for injecting JS, but some very strict sites can panic / show error pages when isolation is disabled (e.g. webflow.com)
|
||
// '--allow-running-insecure-content', // Breaks CORS/CSRF/HSTS etc., useful sometimes but very easy to detect
|
||
// '--allow-file-access-from-files', // <- WARNING, dangerous, allows JS to read filesystem using file:// URLs
|
||
|
||
// // DANGER: Disable HTTPS verification
|
||
// '--ignore-certificate-errors',
|
||
// '--ignore-ssl-errors',
|
||
// '--ignore-certificate-errors-spki-list',
|
||
// '--allow-insecure-localhost',
|
||
|
||
// IO: stdin/stdout, debug port config
|
||
// chrome://inspect
|
||
'--log-level=2', // 1=DEBUG 2=WARNING 3=ERROR
|
||
'--enable-logging=stderr',
|
||
'--remote-debugging-address=0.0.0.0',
|
||
`--remote-debugging-port=${CHROME_DEBUG_PORT}`,
|
||
|
||
// GPU, canvas, text, and pdf rendering config
|
||
// chrome://gpu
|
||
'--enable-webgl', // enable web-gl graphics support
|
||
'--font-render-hinting=none', // make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;}
|
||
'--force-color-profile=srgb', // make rendering more deterministic by using consistent color profile, if browser looks weird, try: generic-rgb
|
||
'--disable-partial-raster', // make rendering more deterministic (TODO: verify if still needed)
|
||
'--disable-skia-runtime-opts', // make rendering more deterministic by avoiding Skia hot path runtime optimizations
|
||
'--disable-2d-canvas-clip-aa', // make rendering more deterministic by disabling antialiasing on 2d canvas clips
|
||
// '--disable-gpu', // falls back to more consistent software renderer
|
||
// // '--use-gl=swiftshader', <- DO NOT USE, breaks M1 ARM64. it makes rendering more deterministic by using simpler CPU renderer instead of OS GPU renderer bug: https://groups.google.com/a/chromium.org/g/chromium-dev/c/8eR2GctzGuw
|
||
// // '--disable-software-rasterizer', <- DO NOT USE, harmless, used in tandem with --disable-gpu
|
||
// // '--run-all-compositor-stages-before-draw', <- DO NOT USE, makes headful chrome hang on startup (tested v121 Google Chrome.app on macOS)
|
||
// // '--disable-gl-drawing-for-tests', <- DO NOT USE, disables gl output (makes tests run faster if you dont care about canvas)
|
||
// // '--blink-settings=imagesEnabled=false', <- DO NOT USE, disables images entirely (only sometimes useful to speed up loading)
|
||
|
||
// Process management & performance tuning
|
||
// chrome://process-internals
|
||
'--disable-lazy-loading', // make rendering more deterministic by loading all content up-front instead of on-focus
|
||
'--disable-renderer-backgrounding', // dont throttle tab rendering based on focus/visibility
|
||
'--disable-background-networking', // dont throttle tab networking based on focus/visibility
|
||
'--disable-background-timer-throttling', // dont throttle tab timers based on focus/visibility
|
||
'--disable-backgrounding-occluded-windows', // dont throttle tab window based on focus/visibility
|
||
'--disable-ipc-flooding-protection', // dont throttle ipc traffic or accessing big request/response/buffer/etc. objects will fail
|
||
'--disable-extensions-http-throttling', // dont throttle http traffic based on runtime heuristics
|
||
'--disable-field-trial-config', // disable shared field trial state between browser processes
|
||
'--disable-back-forward-cache', // disable browsing navigation cache
|
||
// '--in-process-gpu', <- DONT USE THIS, makes headful startup time ~5-10s slower (tested v121 Google Chrome.app on macOS)
|
||
// '--disable-component-extensions-with-background-pages', // TODO: check this, disables chrome components that only run in background (could lower startup time)
|
||
|
||
// uncomment to disable hardware camera/mic/speaker access + present fake devices to websites
|
||
// (faster to disable, but disabling breaks recording browser audio in puppeteer-stream screenrecordings)
|
||
// '--use-fake-device-for-media-stream',
|
||
// '--use-fake-ui-for-media-stream',
|
||
// '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider',
|
||
|
||
// // Output format options (PDF, screenshot, etc.)
|
||
'--export-tagged-pdf', // include table of contents and tags in printed PDFs
|
||
'--generate-pdf-document-outline',
|
||
|
||
// Suppress first-run features, popups, hints, updates, etc.
|
||
// chrome://system
|
||
'--no-pings',
|
||
'--no-first-run',
|
||
'--no-default-browser-check',
|
||
'--disable-default-apps',
|
||
'--ash-no-nudges',
|
||
'--disable-infobars',
|
||
'--disable-search-engine-choice-screen',
|
||
'--disable-session-crashed-bubble',
|
||
'--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
|
||
'--hide-crash-restore-bubble',
|
||
'--suppress-message-center-popups',
|
||
'--disable-client-side-phishing-detection',
|
||
'--disable-domain-reliability',
|
||
'--disable-component-update',
|
||
'--disable-datasaver-prompt',
|
||
'--disable-hang-monitor',
|
||
'--disable-session-crashed-bubble',
|
||
'--disable-speech-synthesis-api',
|
||
'--disable-speech-api',
|
||
'--disable-print-preview',
|
||
'--safebrowsing-disable-auto-update',
|
||
'--deny-permission-prompts',
|
||
'--disable-external-intent-requests',
|
||
'--disable-notifications',
|
||
'--disable-desktop-notifications',
|
||
'--noerrdialogs',
|
||
'--disable-popup-blocking',
|
||
'--disable-prompt-on-repost',
|
||
'--silent-debugger-extension-api',
|
||
'--block-new-web-contents',
|
||
'--metrics-recording-only',
|
||
'--disable-breakpad',
|
||
|
||
|
||
// other feature flags
|
||
// chrome://flags chrome://components
|
||
`--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
|
||
'--enable-features=NetworkService',
|
||
]
|
||
const CHROME_ARGS_EXTRA = []
|
||
|
||
|
||
const CHROME_LAUNCH_OPTIONS = {
|
||
CHROME_PROFILE_PATH,
|
||
CHROME_PROFILE_USER,
|
||
CHROME_EXTENSIONS,
|
||
CHROME_DEBUG_PORT,
|
||
CHROME_DISABLED_COMPONENTS,
|
||
DEFAULT_VIEWPORT,
|
||
CHROME_ARGS_DEFAULT,
|
||
CHROME_ARGS_EXTRA,
|
||
}
|
||
/* Chrome CLI Args Documentation
|
||
- https://github.com/GoogleChrome/chrome-launcher/blob/main/docs/chrome-flags-for-tools.md
|
||
- https://chromium.googlesource.com/chromium/chromium/+/master/content/public/common/content_switches.cc
|
||
- https://jtway.co/optimize-your-chrome-options-for-testing-to-get-x1-25-impact-4f19f071bf45
|
||
- https://peter.sh/experiments/chromium-command-line-switches/
|
||
- https://www.chromium.org/developers/how-tos/run-chromium-with-flags/
|
||
- https://github.com/manoj9788/Chrome-Driver-arguments/blob/master/README.md
|
||
*/
|
||
const getChromeArgs = ({CHROME_ARGS_DEFAULT, CHROME_ARGS_EXTRA,
|
||
CHROME_PROFILE_PATH, CHROME_PROFILE_USER,
|
||
CHROME_EXTENSIONS,
|
||
CHROME_DEBUG_PORT,
|
||
CHROME_DISABLED_COMPONENTS,
|
||
DEFAULT_VIEWPORT}=CHROME_LAUNCH_OPTIONS) =>
|
||
[
|
||
...CHROME_ARGS_DEFAULT,
|
||
`--user-data-dir=${CHROME_PROFILE_PATH}`,
|
||
`--profile-directory=${CHROME_PROFILE_USER}`,
|
||
`--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`,
|
||
`--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({id}) => id).join(',')}`,
|
||
`--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
|
||
`--remote-debugging-port=${CHROME_DEBUG_PORT}`,
|
||
`--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
|
||
...CHROME_ARGS_EXTRA,
|
||
]
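// Usage sketch (assumption -- the real launch call lives further down in this file and may differ):
//   const browser = await PupeteerExtra.launch({
//       headless: false,
//       defaultViewport: DEFAULT_VIEWPORT,
//       args: getChromeArgs(CHROME_LAUNCH_OPTIONS),   // profile dir, extensions, debug port, etc.
//   })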
|
||
|
||
|
||
/******************** Chrome Extension Management *****************************/
|
||
|
||
function getExtensionId(unpacked_path) {
|
||
const manifest_path = path.join(unpacked_path, 'manifest.json')
|
||
if (!fs.existsSync(manifest_path)) return null
|
||
|
||
// chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id
|
||
const hash = crypto.createHash('sha256');
|
||
hash.update(Buffer.from(unpacked_path, 'utf-8'));
|
||
const detected_extension_id = Array.from(hash.digest('hex'))
|
||
.slice(0, 32) // Convert each hexadecimal character to a character in the range 'a'-'p'
|
||
.map(i => String.fromCharCode(parseInt(i, 16) + 'a'.charCodeAt(0)))
|
||
.join('');
|
||
|
||
return detected_extension_id
|
||
}
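// Worked example (illustrative only): if sha256(unpacked_path) starts with hex "0f3a...",
// the first 32 hex chars map 0->'a', f->'p', 3->'d', a->'k', ... producing an id like "apdk...".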
|
||
|
||
async function installExtension(extension) {
|
||
const manifest_path = path.join(extension.unpacked_path, 'manifest.json')
|
||
|
||
// Download extensions using:
|
||
// curl -fsSL 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D$EXTENSION_ID%26uc' > extensionname.crx
|
||
// unzip -d extensionname extensionname.zip
|
||
|
||
if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
|
||
console.log("[🛠️] Downloading missing extension", extension.name, extension.webstore_id, '->', extension.crx_path);
|
||
|
||
// Download crx file from ext.crx_url -> ext.crx_path
|
||
const response = await fetch(extension.crx_url) as Response
|
||
const crx_file = fs.createWriteStream(extension.crx_path);
|
||
if (response.headers.get("content-length") && response.body) {
|
||
// @ts-ignore
|
||
const crx_stream = Readable.fromWeb(response.body)
|
||
await finished(crx_stream.pipe(crx_file))
|
||
} else {
|
||
console.warn('[⚠️] Failed to download extension', extension.name, extension.webstore_id)
|
||
}
|
||
}
|
||
|
||
var {stdout, stderr} = {stdout: '', stderr: ''}
|
||
|
||
// Unzip crx file from ext.crx_url -> ext.unpacked_path
|
||
await fs.promises.mkdir(extension.unpacked_path, {recursive: true})
|
||
try {
|
||
var {stdout, stderr} = await exec(`/usr/bin/unzip ${extension.crx_path} -d ${extension.unpacked_path}`)
|
||
} catch(err1) {
|
||
try {
|
||
await unzip(extension.crx_path, extension.unpacked_path)
|
||
} catch(err2) {
|
||
// console.error(`[❌] Failed to install ${extension.crx_path}: could not unzip crx`, err1, err2)
|
||
// return false
|
||
}
|
||
}
|
||
|
||
if (!fs.existsSync(manifest_path))
|
||
console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`, stdout, stderr)
|
||
|
||
return fs.existsSync(manifest_path)
|
||
}
|
||
|
||
async function loadOrInstallExtension(ext) {
|
||
if (!(ext.webstore_id || ext.unpacked_path))
|
||
throw 'Extension must have either {webstore_id} or {unpacked_path}'
|
||
|
||
// Set statically computable extension metadata
|
||
ext.webstore_id = ext.webstore_id || ext.id
|
||
ext.name = ext.name || ext.webstore_id
|
||
ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`
|
||
ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`
|
||
ext.crx_path = ext.crx_path || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`)
|
||
ext.unpacked_path = ext.unpacked_path || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`)
|
||
|
||
const manifest_path = path.join(ext.unpacked_path, 'manifest.json')
|
||
ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'))
|
||
ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null
|
||
|
||
// if extension is not installed, download and unpack it
|
||
if (!ext.read_version()) {
|
||
await installExtension(ext)
|
||
}
|
||
|
||
// autodetect id from filesystem path (unpacked extensions dont have stable IDs)
|
||
ext.id = getExtensionId(ext.unpacked_path)
|
||
ext.version = ext.read_version()
|
||
if (!ext.version) {
|
||
console.warn('[❌] Unable to detect ID and version of installed extension', prettyPath(ext.unpacked_path))
|
||
} else {
|
||
console.log(`[➕] Installed extension ${ext.name} (${ext.version})...`.padEnd(82), prettyPath(ext.unpacked_path))
|
||
}
|
||
|
||
return ext
|
||
}
|
||
|
||
async function isTargetExtension(target) {
|
||
let target_type
|
||
let target_ctx
|
||
let target_url
|
||
try {
|
||
target_type = target.type()
|
||
target_ctx = (await target.worker()) || (await target.page()) || null
|
||
target_url = target.url() || target_ctx?.url() || null
|
||
} catch(err) {
|
||
if (String(err).includes('No target with given id found')) {
|
||
// because this runs on initial browser startup, we sometimes race with closing the initial
|
||
// new tab page. it will throw a harmless error if we try to check a target that's already closed,
|
||
// ignore it and return null since that page is definitely not an extension's bg page anyway
|
||
target_type = 'closed'
|
||
target_ctx = null
|
||
target_url = 'about:closed'
|
||
} else {
|
||
throw err
|
||
}
|
||
}
|
||
|
||
const target_is_bg = ['service_worker', 'background_page'].includes(target_type)
|
||
const target_is_extension = target_url?.startsWith('chrome-extension://')
|
||
const extension_id = (target_is_extension && target_url.split('://')[1].split('/')[0]) || null
|
||
const manifest_version = target_type === 'service_worker' ? '3' : '2'
|
||
|
||
return {
|
||
target_type,
|
||
target_ctx,
|
||
target_url,
|
||
target_is_bg,
|
||
target_is_extension,
|
||
extension_id,
|
||
manifest_version,
|
||
}
|
||
}
|
||
|
||
async function loadExtensionFromTarget(extensions, target) {
|
||
const {
|
||
target_is_bg,
|
||
target_is_extension,
|
||
target_type,
|
||
target_ctx,
|
||
target_url,
|
||
extension_id,
|
||
manifest_version,
|
||
} = await isTargetExtension(target)
|
||
|
||
if (!(target_is_bg && extension_id && target_ctx))
|
||
return null
|
||
|
||
const manifest = await target_ctx.evaluate(() =>
|
||
// @ts-ignore
|
||
chrome.runtime.getManifest())
|
||
|
||
const { name, version, homepage_url, options_page, options_ui } = manifest
|
||
|
||
if (!version || !extension_id)
|
||
return null
|
||
|
||
const options_url = await target_ctx.evaluate(
|
||
(options_page) => chrome.runtime.getURL(options_page),
|
||
options_page || options_ui?.page || 'options.html',
|
||
)
|
||
|
||
const commands = await target_ctx.evaluate(async () =>
|
||
(await new Promise((resolve, reject) => {
|
||
if (chrome.commands)
|
||
chrome.commands.getAll(resolve)
|
||
else
|
||
resolve({})
|
||
}))
|
||
)
|
||
|
||
// console.log(`[+] Found Manifest V${manifest_version} Extension:`, extension_id, name, target_url, Object.keys(commands).length)
|
||
|
||
let dispatchEval = async (...args) =>
|
||
await target_ctx.evaluate(...args)
|
||
let dispatchPopup = async () =>
|
||
await target_ctx.evaluate('chrome.action?.openPopup() || chrome.tabs.create({url: chrome.runtime.getURL("popup.html")})')
|
||
|
||
let dispatchAction
|
||
let dispatchMessage
|
||
let dispatchCommand
|
||
|
||
if (manifest_version === '3') {
|
||
dispatchAction = async (tab) => {
|
||
// https://developer.chrome.com/docs/extensions/reference/api/action#event-onClicked
|
||
return await target_ctx.evaluate(async (tab) => {
|
||
tab = tab || (await new Promise((resolve) =>
|
||
chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
|
||
// @ts-ignore
|
||
return await chrome.action.onClicked.dispatch(tab)
|
||
}, tab)
|
||
}
|
||
dispatchMessage = async (message, options) => {
|
||
// https://developer.chrome.com/docs/extensions/reference/api/runtime
|
||
return await target_ctx.evaluate(async (extension_id, message, options) => {
|
||
return await chrome.runtime.sendMessage(extension_id, message, options)
|
||
}, extension_id, message, options)
|
||
}
|
||
dispatchCommand = async (command, tab) => {
|
||
// https://developer.chrome.com/docs/extensions/reference/api/commands#event-onCommand
|
||
return await target_ctx.evaluate(async (command, tab) => {
|
||
// @ts-ignore
|
||
return await chrome.commands.onCommand.dispatch(command, tab)
|
||
}, command, tab)
|
||
}
|
||
} else if (manifest_version === '2') {
|
||
dispatchAction = async (tab) => {
|
||
// https://developer.chrome.com/docs/extensions/mv2/reference/browserAction#event-onClicked
|
||
return await target_ctx.evaluate(async (tab) => {
|
||
tab = tab || (await new Promise((resolve) =>
|
||
chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
|
||
// @ts-ignore
|
||
return await chrome.browserAction.onClicked.dispatch(tab)
|
||
}, tab)
|
||
}
|
||
dispatchMessage = async (message, options) => {
|
||
// https://developer.chrome.com/docs/extensions/mv2/reference/runtime#method-sendMessage
|
||
return await target_ctx.evaluate(async (extension_id, message, options) => {
|
||
return await new Promise((resolve) =>
|
||
chrome.runtime.sendMessage(extension_id, message, options, resolve)
|
||
)
|
||
}, extension_id, message, options)
|
||
}
|
||
dispatchCommand = async (command, tab) => {
|
||
// https://developer.chrome.com/docs/extensions/mv2/reference/commands#event-onCommand
|
||
return await target_ctx.evaluate(async (command, tab) => {
|
||
return await new Promise((resolve) =>
|
||
// @ts-ignore
|
||
chrome.commands.onCommand.dispatch(command, tab, resolve)
|
||
)
|
||
}, command, tab)
|
||
}
|
||
}
|
||
const existing_extension = extensions.filter(({id}) => id === extension_id)[0] || {}
|
||
|
||
const new_extension = {
|
||
...existing_extension,
|
||
id: extension_id,
|
||
webstore_name: name,
|
||
|
||
target,
|
||
target_ctx,
|
||
target_type,
|
||
target_url,
|
||
|
||
manifest_version,
|
||
manifest,
|
||
version,
|
||
homepage_url,
|
||
options_url,
|
||
|
||
dispatchEval, // run some JS in the extension's service worker context
|
||
dispatchPopup, // open the extension popup
|
||
dispatchAction, // trigger an extension menubar icon click
|
||
dispatchMessage, // send a chrome runtime message in the service worker context
|
||
dispatchCommand, // trigger an extension keyboard shortcut command
|
||
}
|
||
|
||
console.log(`[➕] Loaded extension ${name.substring(0, 32)} (${version}) ${target_type}...`.padEnd(82), target_url)
|
||
Object.assign(existing_extension, new_extension)
|
||
|
||
return new_extension
|
||
}
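// Example of how the dispatch helpers get used once extensions are loaded (illustrative; the
// extension name and eval expression are assumptions, not calls made by this function):
//   const singlefile = extensions.find(ext => ext.name === 'singlefile')
//   await singlefile?.dispatchAction()                   // simulate clicking its toolbar icon
//   await singlefile?.dispatchEval('chrome.runtime.id')  // run JS inside its service worker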
|
||
|
||
|
||
|
||
async function getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR}) {
|
||
console.log('*************************************************************************')
|
||
console.log(`[⚙️] Installing ${CHROME_EXTENSIONS.length} chrome extensions from CHROME_EXTENSIONS...`)
|
||
try {
|
||
// read extension metadata from filesystem (installing from Chrome webstore if extension is missing)
|
||
for (const extension of CHROME_EXTENSIONS) {
|
||
Object.assign(extension, await loadOrInstallExtension(extension))
|
||
}
|
||
|
||
// for easier debugging, write parsed extension info to filesystem
|
||
await overwriteFile(
|
||
CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.present.json'),
|
||
CHROME_EXTENSIONS,
|
||
)
|
||
} catch(err) {
|
||
console.error(err)
|
||
}
|
||
console.log('*************************************************************************')
|
||
return CHROME_EXTENSIONS
|
||
}
|
||
|
||
let _EXTENSIONS_CACHE = null
|
||
async function getChromeExtensionsFromCache({browser, extensions=CHROME_EXTENSIONS, extensions_dir=CHROME_EXTENSIONS_DIR}) {
|
||
if (_EXTENSIONS_CACHE === null) {
|
||
console.log(`[⚙️] Loading ${CHROME_EXTENSIONS.length} chrome extensions from CHROME_EXTENSIONS...`)
|
||
|
||
// find loaded Extensions at runtime / browser launch time & connect handlers
|
||
// looks at all the open targets for extension service workers / bg pages
|
||
for (const target of browser.targets()) {
|
||
// mutates extensions object in-place to add metadata loaded from filesystem persona dir
|
||
await loadExtensionFromTarget(extensions, target)
|
||
}
|
||
_EXTENSIONS_CACHE = extensions
|
||
|
||
// write installed extension metadata to filesystem extensions.json for easier debugging
|
||
await overwriteFile(
|
||
CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'),
|
||
extensions,
|
||
)
|
||
await overwriteSymlink(
|
||
CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'),
|
||
CHROME_EXTENSIONS_JSON_PATH,
|
||
)
|
||
}
|
||
|
||
return _EXTENSIONS_CACHE
|
||
}
|
||
|
||
async function setup2CaptchaExtension({browser, extensions}) {
|
||
let page = null
|
||
try {
|
||
// open a new tab to finish setting up the 2captcha extension manually using its extension options page
|
||
page = await browser.newPage()
|
||
const { options_url } = extensions.filter(ext => ext.name === 'captcha2')[0]
|
||
await page.goto(options_url)
|
||
await wait(2_500)
|
||
await page.bringToFront()
|
||
|
||
// type in the API key and click the Login button (and auto-close success modal after it pops up)
|
||
await page.evaluate(() => {
|
||
const elem = document.querySelector("input[name=apiKey]") as HTMLInputElement
|
||
elem.value = ""
|
||
})
|
||
await page.type('input[name=apiKey]', API_KEY_2CAPTCHA, { delay: 25 })
|
||
|
||
// toggle all the important switches to ON
|
||
await page.evaluate(() => {
|
||
const checkboxes = Array.from(document.querySelectorAll<HTMLInputElement>('input#isPluginEnabled, input[name*=enabledFor], input[name*=autoSolve]'));
|
||
for (const checkbox of checkboxes) {
|
||
if (!checkbox.checked) checkbox.click()
|
||
}
|
||
})
|
||
|
||
let dialog_opened = false
|
||
page.on('dialog', async (dialog) => {
|
||
setTimeout(async () => {
|
||
await dialog.accept();
|
||
dialog_opened = true
|
||
}, 500);
|
||
})
|
||
await page.click('button#connect')
|
||
await wait(2_500)
|
||
if (!dialog_opened) {
|
||
throw `2captcha extension login confirmation dialog never opened, please check its options page manually: ${options_url}`
|
||
}
|
||
console.log('[🔑] Configured the 2captcha extension using its options page...')
|
||
} catch(err) {
|
||
console.warn(`[❌] Failed to configure the 2captcha extension using its options page!`, err)
|
||
}
|
||
if (page) await page.close()
|
||
}
|
||
|
||
async function speedtest({browser, page, measureUpload=true, timeout=25000}: {browser?: Browser, page?: Page, measureUpload?: boolean, timeout?: number}) {
|
||
// run a speedtest using fast.com, printing results once per second
|
||
|
||
browser = browser || await page.browser()
|
||
page = page || await browser.newPage()
|
||
|
||
// save one speedtest_<date>.json result per day
|
||
const today = versionStrFromDate(new Date(), {withDate: true, withTime: false})
|
||
const SPEEDTEST_PATH = path.join(SPEEDTESTS_DIR, `speedtest_${today}.json`)
|
||
|
||
// check if we've already run one today, if so return earlier results and skip running again
|
||
try {
|
||
return JSON.parse(await fs.promises.readFile(SPEEDTEST_PATH, 'utf-8'))
|
||
} catch(err) {
|
||
// otherwise speedtest does not exist yet for today, continue onwards...
|
||
}
|
||
|
||
console.log('[🚤] Running Speedtest using Fast.com...'.padEnd(82), prettyPath(SPEEDTEST_PATH))
|
||
|
||
await page.goto('https://fast.com', {timeout, waitUntil: 'domcontentloaded'});
|
||
await page.waitForSelector('#speed-value', {timeout})
|
||
|
||
let result = null
|
||
let loop_idx = 0
|
||
|
||
while (loop_idx < 100) {
|
||
result = await page.evaluate(() => {
|
||
const $ = document.querySelector.bind(document);
|
||
|
||
return {
|
||
downloadSpeed: Number($('#speed-value').textContent),
|
||
downloadUnit: $('#speed-units').textContent.trim(),
|
||
downloaded: Number($('#down-mb-value').textContent.trim()),
|
||
uploadSpeed: Number($('#upload-value').textContent),
|
||
uploadUnit: $('#upload-units').textContent.trim(),
|
||
uploaded: Number($('#up-mb-value').textContent.trim()),
|
||
latency: Number($('#latency-value').textContent.trim()),
|
||
bufferBloat: Number($('#bufferbloat-value').textContent.trim()),
|
||
userLocation: $('#user-location').textContent.trim(),
|
||
userIp: $('#user-ip').textContent.trim(),
|
||
isDone: Boolean($('#speed-value.succeeded') && $('#upload-value.succeeded')),
|
||
};
|
||
})
|
||
if (result.downloadSpeed > 0) {
|
||
// console.log(JSON.stringify(result).replaceAll('"', '').replaceAll(',', ' ').replaceAll('{', '').replaceAll('}', ''))
|
||
}
|
||
|
||
if (result.isDone || (!measureUpload && result.uploadSpeed)) {
|
||
break
|
||
}
|
||
|
||
await wait(500)
|
||
loop_idx++
|
||
}
|
||
|
||
await Promise.allSettled([
|
||
page.close(),
|
||
overwriteFile(SPEEDTEST_PATH, result)
|
||
])
|
||
|
||
return result
|
||
}
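// Example usage (illustrative sketch only, not called anywhere by default):
//   const { downloadSpeed, downloadUnit, latency } = await speedtest({ browser })
//   console.log(`~${downloadSpeed} ${downloadUnit} down, ${latency}ms latency`)
// Results are cached one-per-day in SPEEDTESTS_DIR, so repeat calls on the same day return the saved JSON.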
|
||
|
||
/******************************************************************************/
|
||
/******************************************************************************/
|
||
|
||
const ALREADY_ARCHIVED = new Set(['', 'about:blank', 'chrome://newtab', 'chrome://version'])
|
||
const TASKS_PER_RUN_LIMIT = 200
|
||
|
||
async function botArchiveTask({page, data, url=''}) {
|
||
url = url || data // puppeteer-cluster passes in the url value via the data: arg
|
||
|
||
const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
|
||
const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
|
||
if (is_unarchivable_url || is_already_archived) return null
|
||
ALREADY_ARCHIVED.add(url.slice(0, 4096))
|
||
|
||
if (ALREADY_ARCHIVED.size > TASKS_PER_RUN_LIMIT) {
|
||
console.warn('[❌] Hit maximum URLs archived per browser session, exiting to free memory.')
|
||
console.warn(' Run this process again to continue with the next batch...')
|
||
process.exit(21)
|
||
}
|
||
|
||
const browser = await page.browser()
|
||
const client = await page.target().createCDPSession()
|
||
const extensions = await getChromeExtensionsFromCache({browser})
|
||
const browser_version = await browser.version()
|
||
const original_url = url.toString()
|
||
const start_time = (new Date())
|
||
|
||
console.log('[0/4]-------------------------------------------------------------------------')
|
||
const snapshot_dir = await setupSnapshotDir({original_url, start_time})
|
||
const snapshot = await setupSnapshotDB({original_url, start_time, snapshot_dir})
|
||
console.log('[1/4]-------------------------------------------------------------------------')
|
||
console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)
|
||
|
||
|
||
const page_state = {
|
||
// global static state
|
||
browser,
|
||
client,
|
||
browser_version,
|
||
extensions,
|
||
|
||
// per-page static metadata
|
||
original_url,
|
||
snapshot,
|
||
snapshot_dir,
|
||
start_time: start_time.toISOString(),
|
||
start_ts: Number(start_time),
|
||
version: versionStrFromDate(start_time),
|
||
|
||
// per-page mutable archiving state
|
||
main_response: null,
|
||
recorder: null,
|
||
console_log: [],
|
||
traffic_log: {},
|
||
redirects: {},
|
||
}
|
||
page._original_url = original_url
|
||
|
||
try {
|
||
// run all page setup functions in parallel
|
||
const results = await Promise.allSettled([
|
||
// loadAuthStorage(page, page_state, { apply: true }),
|
||
startMetadataRecording(page, page_state),
|
||
setupURLRewriting(page, page_state),
|
||
// setupViewport(page, page_state),
|
||
setupModalAutoClosing(page, page_state),
|
||
loadCloudflareCookie(page, page_state),
|
||
startResponseSaving(page, page_state),
|
||
saveYTDLP(page, page_state),
|
||
saveGALLERYDL(page, page_state),
|
||
// saveSourceMaps(page, page_state),
|
||
// TODO: someday setup https://github.com/osnr/TabFS ?
|
||
]);
|
||
// collect any setup steps that failed and report them
|
||
const rejected = results
|
||
.filter(result => result.status === 'rejected')
|
||
.map(result => (result as PromiseRejectedResult).reason);
|
||
if (rejected.length) console.warn('[⚠️] Partial failures during page setup:', rejected);
|
||
} catch(err) {
|
||
console.error('[❌] PAGE SETUP ERROR', JSON.stringify(err, null, 4))
|
||
return
|
||
}
|
||
|
||
|
||
console.log('[2/4]-------------------------------------------------------------------------')
|
||
|
||
console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
|
||
const startrecording_promise = startScreenrecording(page, page_state)
|
||
page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
|
||
try {
|
||
const results = await Promise.allSettled([
|
||
startrecording_promise,
|
||
page.bringToFront(),
|
||
page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
|
||
])
|
||
const rejected = results
|
||
.filter(result => result.status === 'rejected')
|
||
.map(result => (result as PromiseRejectedResult).reason)
|
||
if (rejected.length) console.warn('[⚠️] Partial failures during page load:', rejected)
|
||
} catch(err) {
|
||
console.error('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4))
|
||
return
|
||
}
|
||
|
||
if (page_state.main_response === null) {
|
||
page_state.main_response = await page.waitForResponse(() => true)
|
||
}
|
||
assert(page_state.main_response)
|
||
if (page_state.main_response.status() == 429) {
|
||
throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
|
||
}
|
||
|
||
// emulate human browsing behavior
|
||
// await disableAnimations(page, page_state);
|
||
await jiggleMouse(page, page_state);
|
||
await solveCaptchas(page, page_state);
|
||
await blockRedirects(page, page_state);
|
||
await scrollDown(page, page_state);
|
||
// await expandComments(page, page_state);
|
||
await submitForm(page, page_state);
|
||
// await blockJSExecution(page, page_state);
|
||
|
||
console.log('[3/4]-------------------------------------------------------------------------')
|
||
|
||
// stop tampering with page requests & JS / recording metadata / traffic log
|
||
await stopMetadataRecording(page, page_state)
|
||
|
||
// do all synchronous archiving steps that need exclusive use of the whole page
|
||
const saveScreenrecording_promise = saveScreenrecording(page, page_state);
|
||
await saveScreenshot(page, page_state);
|
||
await savePDF(page, page_state);
|
||
|
||
console.log('[4/4]-------------------------------------------------------------------------')
|
||
|
||
// do all async archiving steps that can be run at the same time
|
||
await inlineShadowDOM(page, page_state);
|
||
const results = await Promise.allSettled([
|
||
saveTitle(page, page_state),
|
||
saveSEO(page, page_state),
|
||
saveFavicon(page, page_state),
|
||
saveSSL(page, page_state),
|
||
saveRequests(page, page_state),
|
||
saveRedirects(page, page_state),
|
||
saveHeaders(page, page_state),
|
||
saveRaw(page, page_state),
|
||
saveDOM(page, page_state),
|
||
saveBodyText(page, page_state),
|
||
// savePandoc(page, page_state),
|
||
saveReadability(page, page_state),
|
||
saveAccessibility(page, page_state),
|
||
saveOutlinks(page, page_state),
|
||
// saveAuthStorage(page, page_state),
|
||
saveAIQualityAssuranceResult(page, page_state),
|
||
]);
|
||
|
||
// do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
|
||
const bg_results = Promise.allSettled([
|
||
saveScreenrecording_promise,
|
||
saveSinglefile(page, page_state),
|
||
// saveArchiveWebPage(page, page_state),
|
||
// savePocket(page, page_state),
|
||
])
|
||
|
||
const {duration} = await saveMetrics(page, page_state);
|
||
|
||
const rejected = results
|
||
.filter(result => result.status === 'rejected')
|
||
.map(result => (result as PromiseRejectedResult).reason) // not sure why this has a ts-error, .reason does exist on rejected promises
|
||
|
||
if (rejected.length)
|
||
console.warn('[⚠️] Partial failures during archiving:', rejected)
|
||
|
||
// Start an interactive REPL here with the `page` instance.
|
||
// https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
|
||
// await page.repl()
|
||
// await page.browser().repl()
|
||
|
||
console.log(`[✅] ${ANSI.blue}Finished archiving in ${duration/1000}s.${ANSI.reset}`)
|
||
|
||
try {
|
||
const rejected = (await bg_results)
|
||
.filter(result => result.status === 'rejected')
|
||
.map(result => (result as PromiseRejectedResult).reason) // not sure why this has a ts-error, .reason does exist on rejected promises
|
||
if (rejected.length)
|
||
console.warn('[⚠️] Partial failures during wrap-up tasks:', rejected)
|
||
|
||
console.log('[🗑️] Resetting to about:blank to ensure memory is freed...')
|
||
await page.goto('about:blank')
|
||
await page.close()
|
||
} catch(err) {
|
||
console.log(err)
|
||
}
|
||
|
||
// symlink the best results from across all the versions/ into the snapshot dir root
|
||
await symlinkBestSnapshotResults(snapshot_dir)
|
||
|
||
// display latest version screenshot GIF
|
||
console.log()
|
||
try {
|
||
const latest_version_gif = path.join(snapshot_dir, 'versions', page_state.version, path.basename(SCREENRECORDGIF_PATH(page)))
|
||
const dirent = await blockUntilExists(latest_version_gif, {min_bytes: 100, timeout: 15_000})
|
||
child_process.spawn('/Users/squash/.iterm2/imgcat', [dirent.abspath], {stdio: [null, 'inherit', 'inherit']})
|
||
} catch(err) {
|
||
console.warn('[⚠️] Failed to display screenrecording.gif...', err)
|
||
console.log()
|
||
}
|
||
|
||
// determine whether task succeeded or failed based on AI QA score
|
||
const latest_version_aiqa = path.join(snapshot_dir, 'versions', page_state.version, path.basename(AIQA_PATH(page)))
|
||
const qa_results = JSON.parse((await fs.promises.readFile(latest_version_aiqa)).toString())
|
||
if (qa_results.pct_visible < 50) {
|
||
throw `[❌] Task completed with problems, got AI QA score of ${qa_results.pct_visible}%! ${qa_results.warnings.join(', ')} ${qa_results.error_text || ''}`
|
||
} else {
|
||
console.log(`[💫] Task completed successfully: ${qa_results.pct_visible}% ${qa_results.warnings.join(', ') || ''}`)
|
||
console.log(` Summary: ${(qa_results.main_content_title || qa_results.description || 'No title/description detected').substring(0, 80)}... ${qa_results.main_content_author || ''} ${qa_results.main_content_date || ''}`)
|
||
return true
|
||
}
|
||
}
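// Hypothetical wiring into puppeteer-cluster (sketch only; the actual cluster setup lives elsewhere in this script):
//   const cluster = await Cluster.launch({ concurrency: Cluster.CONCURRENCY_PAGE, maxConcurrency: 2 })
//   await cluster.task(botArchiveTask)              // puppeteer-cluster calls it as ({ page, data }) with data = the queued url
//   for (const url of URLS) await cluster.queue(url)
//   await cluster.idle(); await cluster.close()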
|
||
|
||
async function passiveArchiveTask({browser, page, url}) {
|
||
// archive passively (e.g. a tab that was opened already by a human), without changing the active page
|
||
|
||
const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
|
||
const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
|
||
if (is_unarchivable_url || is_already_archived) return null
|
||
ALREADY_ARCHIVED.add(url.slice(0, 4096))
|
||
|
||
// these have to be as early as possible because we're racing with the page load (we might even be too late)
|
||
// jk nevermind, we now re-open a new bg tab for every tab that's created to re-capture the initial request
|
||
// await page.setRequestInterception(true);
|
||
// await page.setCacheEnabled(false);
|
||
|
||
const original_url = url.toString()
|
||
const start_time = (new Date())
|
||
const browser_version = await browser.version()
|
||
|
||
console.log('------------------------------------------------------------------------------')
|
||
console.log('[➕] Starting archive of new tab opened in driver browser...', await browser.version())
|
||
const snapshot_dir = await setupSnapshotDir({original_url, start_time})
|
||
const snapshot = await setupSnapshotDB({ original_url, start_time, snapshot_dir })
|
||
console.log('------------------------------------------------------------------------------')
|
||
console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)
|
||
|
||
// create a new page in the background for archiving
|
||
const old_page = page
|
||
page = await browser.newPage()
|
||
await old_page.bringToFront()
|
||
const client = await page.target().createCDPSession()
|
||
const extensions = await getChromeExtensionsFromCache({ browser })
|
||
|
||
const page_state = {
|
||
// global static state
|
||
browser,
|
||
client,
|
||
browser_version,
|
||
extensions,
|
||
|
||
// per-page static metadata
|
||
original_url,
|
||
snapshot,
|
||
snapshot_dir,
|
||
start_time: start_time.toISOString(),
|
||
start_ts: Number(start_time),
|
||
version: versionStrFromDate(start_time),
|
||
|
||
// per-page mutable archiving state
|
||
main_response: null,
|
||
recorder: null,
|
||
console_log: [],
|
||
traffic_log: {},
|
||
redirects: {},
|
||
}
|
||
page._original_url = original_url
|
||
|
||
try {
|
||
|
||
// run all page setup functions in parallel
|
||
const results = await Promise.allSettled([
|
||
// loadAuthStorage(page, page_state, {apply: true}),
|
||
startMetadataRecording(page, page_state),
|
||
setupURLRewriting(page, page_state),
|
||
startResponseSaving(page, page_state),
|
||
saveYTDLP(page, page_state),
|
||
saveGALLERYDL(page, page_state),
|
||
// saveSourceMaps(page, page_state),
|
||
]);
|
||
const rejected = results
|
||
.filter(result => result.status === 'rejected')
|
||
.map(result => (result as PromiseRejectedResult).reason)
|
||
if (rejected.length) console.warn('[⚠️] Partial failures during page setup:', rejected)
|
||
} catch(err) {
|
||
console.warn('[❌] ERROR DURING PAGE SETUP', JSON.stringify(err, null, 4))
|
||
return
|
||
}
|
||
|
||
// load the url in the background page, then switch to it once its loaded and close the original tab
|
||
console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
|
||
const startrecording_promise = startScreenrecording(page, page_state)
|
||
page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
|
||
|
||
// for debugging
|
||
globalThis.page = page
|
||
globalThis.page_state = page_state
|
||
|
||
// start loading the page, start screenrecording, close the old page, and wait for loading to finish (all at once, fine for these to race)
|
||
try {
|
||
const results = await Promise.allSettled([
|
||
startrecording_promise,
|
||
page.bringToFront(),
|
||
old_page.close(),
|
||
page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
|
||
])
|
||
const rejected = results
|
||
.filter(result => result.status === 'rejected')
|
||
.map(result => (result as PromiseRejectedResult).reason)
|
||
if (rejected.length) console.warn('[⚠️] Partial failures during page load:', rejected)
|
||
} catch(err) {
|
||
console.warn('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4))
|
||
return
|
||
}
|
||
|
||
if (page_state.main_response === null) {
|
||
page_state.main_response = await page.waitForResponse(() => true)
|
||
}
|
||
assert(page_state.main_response)
|
||
if (page_state.main_response.status() == 429) {
|
||
throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
|
||
}
|
||
|
||
// resume page if paused by waitForDebuggerOnStart/dev tools debugger/backgrounding
|
||
try {
|
||
await client.send('Page.enable');
|
||
await client.send('Page.setWebLifecycleState', {state: 'active'});
|
||
await client.send('Runtime.runIfWaitingForDebugger')
|
||
} catch(err) { /* console.warn(err) */ }
|
||
|
||
// wait a few seconds for the page to finish loading
|
||
await wait(5_000)
|
||
|
||
// emulate human browsing behavior
|
||
// await disableAnimations(page, page_state);
|
||
// await jiggleMouse(page, page_state);
|
||
await solveCaptchas(page, page_state);
|
||
// await blockRedirects(page, page_state);
|
||
// await scrollDown(page, page_state);
|
||
// await expandComments(page, page_state);
|
||
await submitForm(page, page_state);
|
||
// await blockJSExecution(page, page_state);
|
||
await stopMetadataRecording(page, page_state) // stop tampering with page requests & JS
|
||
|
||
console.log('[3/4]-------------------------------------------------------------------------')
|
||
|
||
// do all synchronous archiving steps that need exclusive use of the whole page
|
||
const saveScreenrecording_promise = saveScreenrecording(page, page_state);
|
||
await saveScreenshot(page, page_state);
|
||
await savePDF(page, page_state);
|
||
|
||
console.log('[4/4]-------------------------------------------------------------------------')
|
||
|
||
// do all async archiving steps that can be run at the same time
|
||
await inlineShadowDOM(page, page_state);
|
||
const results = await Promise.allSettled([
|
||
saveTitle(page, page_state),
|
||
saveSEO(page, page_state),
|
||
saveFavicon(page, page_state),
|
||
saveSSL(page, page_state),
|
||
saveRequests(page, page_state),
|
||
saveRedirects(page, page_state),
|
||
saveHeaders(page, page_state),
|
||
saveRaw(page, page_state),
|
||
saveDOM(page, page_state),
|
||
saveBodyText(page, page_state),
|
||
// savePandoc(page, page_state),
|
||
saveReadability(page, page_state),
|
||
saveAccessibility(page, page_state),
|
||
saveOutlinks(page, page_state),
|
||
// saveAuthStorage(page, page_state),
|
||
saveAIQualityAssuranceResult(page, page_state),
|
||
]);
|
||
|
||
// do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
|
||
const bg_results = Promise.allSettled([
|
||
saveScreenrecording_promise,
|
||
saveSinglefile(page, page_state),
|
||
// saveArchiveWebPage(page, page_state),
|
||
// savePocket(page, page_state),
|
||
])
|
||
|
||
const {duration} = await saveMetrics(page, page_state);
|
||
|
||
const rejected = results
|
||
.filter(result => result.status === 'rejected')
|
||
.map(result => (result as PromiseRejectedResult).reason)
|
||
|
||
if (rejected.length)
|
||
console.warn('[⚠️] Partial failures during page archiving:', rejected)
|
||
|
||
// Start an interactive REPL here with the `page` instance.
|
||
// https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
|
||
// await page.repl()
|
||
// await page.browser().repl()
|
||
|
||
console.log(`[✅] Finished archiving in ${duration/1000}s.`,)
|
||
|
||
// await page.tracing.stop();
|
||
try {
|
||
const rejected = (await bg_results)
|
||
.filter(result => result.status === 'rejected')
|
||
.map(result => (result as PromiseRejectedResult).reason)
|
||
if (rejected.length)
|
||
console.warn('[⚠️] Partial failures during page wrap-up tasks:', rejected)
|
||
} catch(err) {
|
||
console.log(err)
|
||
}
|
||
await symlinkBestSnapshotResults(snapshot_dir)
|
||
}
|
||
|
||
|
||
/******************************************************************************/
|
||
/************************* Page Setup Tasks ***********************************/
|
||
|
||
|
||
|
||
async function setupSnapshotDir({original_url, start_time, snapshot_dir=null}) {
|
||
// setup archive/<id> snapshot output folder, move old files into versions/<date>/* + clear any existing symlinks
|
||
|
||
const snap_dir = snapshot_dir || TASK_PATH(original_url)
|
||
|
||
console.log()
|
||
console.log()
|
||
console.log(ANSI.blue + original_url + ANSI.reset)
|
||
console.log(ANSI.black + snap_dir + ANSI.reset)
|
||
console.log()
|
||
console.log('[📂] Setting up Snapshot output directory...'.padEnd(82), prettyPath(snap_dir))
|
||
|
||
// check for existing data at old legacy paths e.g. ./data/archive/1999999999.1723425
|
||
const hacky_dir = path.join(ARCHIVE_DIR, `1999999999.${hashCode(original_url)}`)
|
||
const known_dir = SNAPSHOT_DIRS_BY_URL[original_url]
|
||
|
||
const known_dir_exists = fs.existsSync(known_dir)
|
||
const hacky_dir_exists = fs.existsSync(hacky_dir)
|
||
|
||
if (snap_dir == hacky_dir) {
|
||
if (known_dir_exists) {
|
||
throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${known_dir}!`
|
||
}
|
||
} else if (snap_dir == known_dir) {
|
||
if (hacky_dir_exists) {
|
||
throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${hacky_dir}!`
|
||
}
|
||
} else {
|
||
if (known_dir_exists) {
|
||
throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${known_dir}!`
|
||
} else if (hacky_dir_exists) {
|
||
throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${hacky_dir}!`
|
||
} else {
|
||
throw `Tried to create snapshot in ${snap_dir} but it's not a recognized snapshot dir path:\n - ${known_dir}\n - ${hacky_dir}`
|
||
}
|
||
}
|
||
|
||
// mkdir -p ./data/archive/<snap_id>/versions && cd ./data/archive/<snap_id>
|
||
await fs.promises.mkdir(path.join(snap_dir, 'versions'), {recursive: true})
|
||
process.chdir(snap_dir)
|
||
|
||
// clear any /data/archive/<snap_id>/*.* symlinks pointing to existing ./versions/<versionid>/*.* files
|
||
await clearSnapshotDirSymlinks(snap_dir)
|
||
|
||
// move /data/archive/<snap_id>/*.* loose output files from any prior run into ./versions/<versionid>/*.*
|
||
await collectSnapshotDirVersionFiles(snap_dir)
|
||
|
||
// update /data/indexes/<index_name>/* to include references to /data/archive/<snap_id> as-needed
|
||
await updateSnapshotDirIndexes(snap_dir, {original_url, start_time})
|
||
|
||
// assert /data/archive/<snap_id>/ contains no invalid/partial files + is empty/ready to receive new files
|
||
await assertSnapshotDirIsValid(snap_dir, {is_empty: true})
|
||
|
||
return snap_dir
|
||
}
|
||
|
||
// ./index/<index_name> : index_getter(page_state) => "<index_key_str>"
|
||
const INDEXES = {
|
||
snapshots_by_day: ({start_time}) =>
|
||
versionStrFromDate(start_time, {withDate: true, withTime: false}),
|
||
snapshots_by_domain: ({original_url}) =>
|
||
(new URL(original_url)).hostname || '', // hostname does not include :port
|
||
}
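// Illustrative examples of the index keys these getters produce (assuming versionStrFromDate formats dates as YYYYMMDD):
//   snapshots_by_day({ start_time: new Date('2001-01-31T12:00:00Z') })      => '20010131'
//   snapshots_by_domain({ original_url: 'https://example.com:8080/page' })  => 'example.com'   // hostname drops the :port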
|
||
|
||
async function updateSnapshotDirIndexes(snap_dir, page_state, indexes=INDEXES, indexes_dir=INDEXES_DIR) {
|
||
assert(indexes)
|
||
console.log(`[🔎] Linking Snapshot in indexes (${Object.keys(indexes).join(', ')})...`)
|
||
// const {snapshot_dir, original_url, start_ts} = page_state
|
||
for (const [index_name, index_key_getter] of Object.entries(indexes)) {
|
||
const index_entry = await indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir}, page_state)
|
||
}
|
||
}
|
||
|
||
async function indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir=INDEXES_DIR}, page_state) {
|
||
// place symlinks to this snapshot in any /indexes/<index_name/ indexes as-needed
|
||
// const snap_id = snap_dir.split('/').at(-1)
|
||
|
||
const index_dir = path.join(indexes_dir, index_name) // /data/index/snapshots_by_day
|
||
await fs.promises.mkdir(index_dir, {recursive: true})
|
||
|
||
// calculate the index key, e.g. "20010131" or "example.com"
|
||
assert(index_name && index_key_getter)
|
||
assert(page_state)
|
||
const index_key = String(index_key_getter(page_state)) // '20010131'
|
||
assert(index_key)
|
||
const snap_id = path.parse(snap_dir).base // '19999999.23423523'
|
||
assert(snap_id)
|
||
|
||
const index_entries_dir = path.join(index_dir, index_key) // /data/index/snapshots_by_day/20010131
|
||
await fs.promises.mkdir(index_entries_dir, {recursive: true})
|
||
|
||
const symlink_path = path.join(index_entries_dir, snap_id) // /data/index/snapshots_by_day/20010131/19999999.23423523
|
||
|
||
// create symlink index/snapshots_by_day/<YYYYMMDD>/<snap id> -> ./archive/<snap_id> symlink
|
||
const {symlink_abspath} = await overwriteSymlink(snap_dir, symlink_path, {relative: true, mkdirs: false})
|
||
}
|
||
|
||
|
||
async function collectSnapshotDirVersionFiles(snap_dir) {
|
||
// move archive/<id>/*.* snapshot output files into archive/<id>/versions/<date>/* dated version folder
|
||
|
||
// detect start time / version info from previous result metrics.json
|
||
const snap_id = snap_dir.split('/archive/').at(-1)
|
||
const existing_metrics = path.join(snap_dir, 'metrics.json')
|
||
let {start_time, VERSION} = {start_time: '1970-01-01T00:00:00.000Z', VERSION: '19700101000000'}
|
||
try {
|
||
;({start_time, VERSION} = JSON.parse(await fs.promises.readFile(existing_metrics, 'utf-8')));
|
||
} catch(err) {
|
||
// continue normally, overwriting existing files is fine if they're broken to begin with
|
||
}
|
||
|
||
// create new version folder based on metrics.json start_time (or epoch time as fallback for legacy output)
|
||
const version_dir_name = VERSION || versionStrFromDate(start_time)
|
||
const version_dir = path.join(snap_dir, 'versions', version_dir_name)
|
||
await fs.promises.mkdir(version_dir, {recursive: true})
|
||
|
||
// move all result files from snapshot_dir root into version folder
|
||
const existing_snapshot_files =
|
||
(await fs.promises.readdir(snap_dir, {withFileTypes: true}))
|
||
.filter(dirent => {
|
||
if (dirent.name.startsWith('.')) return false // ignore hidden files, dont version them
|
||
if (dirent.name == 'versions') return false // dont try to move versions folder into itself
|
||
if (dirent.isSymbolicLink()) return false // skip existing symbolic links
|
||
return (dirent.isFile() || dirent.isDirectory()) // dont try to version sockets/FIFOs/devs etc.
|
||
})
|
||
|
||
if (existing_snapshot_files.length) {
|
||
console.log(`[📅] Moving snapshot results into version dir: ./data/archive/${snap_id}/* ->`.padEnd(82), `./data/archive/${snap_id}/versions/${version_dir_name}/`)
|
||
}
|
||
|
||
const snapshot_files = await getDirInfo(snap_dir, {withRoot: false, filter: ({relpath}) => !relpath.startsWith('versions')})
|
||
const version_files = await getDirInfo(version_dir, {withRoot: false})
|
||
|
||
for (const {name} of existing_snapshot_files) {
|
||
const snapdir_entry_abspath = path.join(snap_dir, name)
|
||
const versioned_entry_abspath = path.join(version_dir, name)
|
||
|
||
const snapshot_entry = snapshot_files[name]
|
||
const version_entry = version_files[name]
|
||
|
||
if (snapshot_entry && version_entry) {
|
||
// a conflicting file/dir already exists in the destination path
|
||
// we have a few options here, we can try to merge them, or we can create a new version
|
||
|
||
if (snapshot_entry.sha256 == version_entry.sha256) {
|
||
// both are the same already, delete the duplicate (leaving the copy inside the version dir)
|
||
// if (snapshot_entry.is_dir) {
|
||
// await fs.promises.rmdir(snapshot_entry.abspath, {recursive: true})
|
||
// } else {
|
||
// await fs.promises.unlink(snapshot_entry.abspath)
|
||
// }
|
||
// console.warn(`[!] Found harmless exact duplicate files, leaving as is: ${snapshot_entry.summary} and ${version_entry.summary}`)
|
||
} else {
|
||
// contents differ: keep both copies in place for now and just log the conflict below
|
||
if (snapshot_entry.num_bytes > version_entry.num_bytes) {
|
||
// snapshot entry is bigger, keep it and delete version entry?
|
||
} else {
|
||
// version entry is bigger, keep it and delete snapshot entry
|
||
}
|
||
console.warn(' ', snapshot_entry.summary)
|
||
console.warn(' ', version_entry.summary)
|
||
// throw `Found conflicting duplicate files with different contents: ${name}`
|
||
}
|
||
} else {
|
||
// mv ./data/archive/<snap_id>/example.txt -> ./data/archive/<snap_id>/versions/<version_id>/example.txt
|
||
await fs.promises.rename(snapdir_entry_abspath, versioned_entry_abspath)
|
||
console.log(` ↣ ${prettyPath(snapdir_entry_abspath)} ->`.padEnd(82), prettyPath(versioned_entry_abspath))
|
||
}
|
||
}
|
||
}
|
||
|
||
// Extractor definition
|
||
// {
|
||
// phase: setup | load | sync1 | async1 | sync2 | close
|
||
// name: 'media' | 'photos', 'wget', 'singlefile'
|
||
//
|
||
// shouldRun(page, page_state)
|
||
|
||
// pageSetup
|
||
// pageLoad
|
||
// pageInteraction clicking around/scrolling
|
||
// archivePhase1 sync
|
||
// archivePhase2 async
|
||
// archivePhase3 async
|
||
// pageClose
|
||
|
||
// execute(page, page_state)
|
||
// validateResult(page, page_state)
|
||
// }
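// A possible TypeScript shape for the extractor definition sketched above
// (illustrative only; nothing implements or consumes this type yet):
type Extractor = {
    name: string,                     // e.g. 'media' | 'photos' | 'wget' | 'singlefile'
    phase: 'setup' | 'load' | 'sync1' | 'async1' | 'sync2' | 'close',
    shouldRun(page: Page, page_state: object): boolean | Promise<boolean>,
    execute(page: Page, page_state: object): Promise<unknown>,
    validateResult(page: Page, page_state: object): Promise<boolean>,
}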
|
||
|
||
async function clearSnapshotDirSymlinks(snap_dir) {
|
||
// delete all archive/<id>/* symlinks in preparation for new snapshot output to be placed there
|
||
|
||
const existing_symlinks =
|
||
(await fs.promises.readdir(snap_dir, {withFileTypes: true}))
|
||
.filter(dirent => {
|
||
if (dirent.name.startsWith('.')) return false // ignore hidden files, dont version them
|
||
if (dirent.name == 'versions') return false // dont try to move versions folder into itself
|
||
return dirent.isSymbolicLink()
|
||
})
|
||
|
||
for (const {name: existing_symlink} of existing_symlinks) {
|
||
await fs.promises.unlink(path.join(snap_dir, existing_symlink))
|
||
// if symlinks are not cleared before starting, it can cause issues with outputs writing into previous versions folders
|
||
// e.g. screenrecording saves to ./media which could be pointing to a previous version's ./versions/<olddate>/media
|
||
}
|
||
}
|
||
|
||
async function symlinkBestSnapshotResults(snap_dir) {
|
||
// move any existing files into versions/<date> folder (clear out main folder)
|
||
// symlink latest files from versions/<date>/* into main folder
|
||
|
||
await fs.promises.mkdir(path.join(snap_dir, 'versions'), {recursive: true})
|
||
process.chdir(snap_dir)
|
||
|
||
const metrics_file = path.join(snap_dir, 'metrics.json')
|
||
// if (!fs.existsSync(metrics_file) || (await fs.promises.lstat(metrics_file)).isSymbolicLink()) {
|
||
// console.warn('[⚠️] Warning, found partial dirty snapshot state (did the snapshot get interrupted?)', snap_dir)
|
||
// }
|
||
|
||
// move output files into versioned folder
|
||
await collectSnapshotDirVersionFiles(snap_dir)
|
||
|
||
// clear any existing symlinks
|
||
await clearSnapshotDirSymlinks(snap_dir)
|
||
|
||
// assert task dir is empty and contains no bare files that might get overwritten, also asserts version dirs are valid
|
||
await assertSnapshotDirIsValid(snap_dir, {is_empty: true})
|
||
|
||
|
||
const version_dirs = (await fs.promises.readdir(path.join(snap_dir, 'versions'))).sort() // earliest to latest
|
||
const most_recent = version_dirs.at(-1)
|
||
|
||
// for each version dir in versions/ (oldest -> newest)
|
||
for (const version_dir of version_dirs) {
|
||
if (version_dir.startsWith('.')) continue
|
||
|
||
const version_dir_abspath = path.join(snap_dir, 'versions', version_dir)
|
||
const version_dir_files = (
|
||
(await fs.promises.readdir(version_dir_abspath))
|
||
.filter(filename => !filename.startsWith('.')))
|
||
|
||
// iterate through all the files/folders in the version dir
|
||
for (const filename of version_dir_files) {
|
||
const snapdir_entry = path.join(snap_dir, filename) // ./data/archive/<snapid>/filename
|
||
const versiondir_entry = path.join(snap_dir, 'versions', version_dir, filename) // ./data/archive/<snapid>/versions/<versionid>/filename
|
||
|
||
if (fs.existsSync(snapdir_entry)) {
|
||
// if an entry already exists in the snapshot root for this filename
|
||
if ((await fs.promises.lstat(snapdir_entry)).isSymbolicLink()) {
|
||
// if a symlink already exists in the root with the same name,
|
||
// check if the version file we're looking at is a better candidate to replace it
|
||
|
||
const existing_abspath = await fs.promises.realpath(snapdir_entry)
|
||
const desired_abspath = path.join(version_dir_abspath, filename)
|
||
if (existing_abspath != desired_abspath) {
|
||
// check if the new candidate is larger or if the existing symlink is larger (largest file = most likely to be highest quality capture data)
|
||
const largest_path = await getLargestPath(existing_abspath, desired_abspath)
|
||
if (largest_path != (await fs.promises.realpath(existing_abspath))) {
|
||
const larger_version = path.basename(path.dirname(largest_path))
|
||
const larger_abspath = path.join(snap_dir, 'versions', larger_version, filename)
|
||
|
||
// console.log(' - swapping for larger file:', filename, '->', larger_abspath.split('/archive/').at(-1))
|
||
await overwriteSymlink(larger_abspath, snapdir_entry, {search_limit: snap_dir})
|
||
} else {
|
||
// console.log(' - leaving larger file:', largest_path.split('/archive/').at(-1))
|
||
}
|
||
} else {
|
||
// leave existing symlink pointing to current version file, nothing to change
|
||
// console.log(' - leaving current file:', existing_abspath.split('/archive/').at(-1))
|
||
}
|
||
} else {
|
||
// clearSnapshotDirSymlinks() should have already cleared these files out!
|
||
throw `Non-symlink file found in root of snapshot dir! Refusing to overwrite: ${prettyPath(snapdir_entry)}`
|
||
}
|
||
} else {
|
||
// no entry exists in the snapshot root for this filename, create one by linking to the version file
|
||
await overwriteSymlink(versiondir_entry, snapdir_entry, {search_limit: snap_dir})
|
||
}
|
||
// if (version_dir == most_recent) {
|
||
// // only log most recent links even though we link older ones too (otherwise its too noisy)
|
||
// console.log(` 🔗 ./${filename} -> ./${versiondir_entry} linking...`)
|
||
// }
|
||
}
|
||
}
|
||
|
||
return snap_dir
|
||
}
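// Resulting layout after symlinking (illustrative, with hypothetical version ids and filenames):
//   ./data/archive/<snap_id>/screenshot.png -> versions/20240131120000/screenshot.png
//   ./data/archive/<snap_id>/versions/20240131120000/screenshot.png    (actual bytes live here)
// Root entries are always symlinks pointing at whichever version holds the largest (presumed best) copy of each file.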
|
||
|
||
async function assertSnapshotDirIsValid(snap_dir, {is_empty=false}={}) {
|
||
process.chdir(snap_dir)
|
||
console.log()
|
||
console.log(`[☑️] Checking that snapshot records are valid...`)
|
||
|
||
// get all directory entries in archive/<snapshot_id>/*
|
||
const snapshot_dir_entries =
|
||
(await fs.promises.readdir(snap_dir, {withFileTypes: true}))
|
||
.filter(dirent => {
|
||
if (dirent.name.startsWith('.')) return false
|
||
if (dirent.name == 'versions') return false
|
||
return true // keep all remaining entries (previously the filter returned undefined for everything, so the checks below never ran)
})
|
||
|
||
// assert versions folder exists and is not a symbolic link
|
||
const versions_dir = path.join(snap_dir, 'versions')
|
||
assert(fs.existsSync(versions_dir))
|
||
assert(!(await fs.promises.lstat(versions_dir)).isSymbolicLink())
|
||
|
||
// if it should be empty, check that no loose files exist
|
||
if (is_empty) {
|
||
assert(!snapshot_dir_entries.length, `Found loose files in snapshot-dir that shouldn't be there! ${snap_dir}`)
|
||
}
|
||
|
||
// assert all non-hidden files in snapshot dir are symbolic links to actual data in versions/<date>/*
|
||
for (const snapshot_dir_entry of snapshot_dir_entries) {
|
||
if (snapshot_dir_entry.name.startsWith('.')) continue
|
||
if (snapshot_dir_entry.name == 'versions') continue
|
||
assert(snapshot_dir_entry.isSymbolicLink(), `Found non-symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
|
||
assert(fs.existsSync(snapshot_dir_entry.name), `Found broken symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
|
||
}
|
||
|
||
const version_entries = (
|
||
(await fs.promises.readdir(versions_dir))
|
||
.filter(foldername => !foldername.startsWith('.'))
|
||
.sort())
|
||
|
||
console.log(` √ ${prettyPath(versions_dir)}`, version_entries.length)
|
||
|
||
for (const version_dir of version_entries) {
|
||
await assertVersionDirIsValid(path.join(versions_dir, version_dir))
|
||
}
|
||
|
||
// write snapshot dir file listing w/ sizes & hashes to .files.json
|
||
const directory_info = await getDirInfo(snap_dir, {withRoot: true, withHelpers: false, maxdepth: 3})
|
||
await overwriteFile(path.join(snap_dir, '.files.json'), directory_info)
|
||
}
|
||
|
||
async function assertVersionDirIsValid(version_dir) {
|
||
const dirname = path.parse(version_dir).name
|
||
assert(fs.existsSync(version_dir), `Version dir does not exist: ${prettyPath(version_dir)}`)
|
||
|
||
const dirent = await fs.promises.lstat(version_dir)
|
||
assert(dirent.isDirectory() && !dirent.isSymbolicLink(), `Found non-directory in versions dir! ${prettyPath(version_dir)}`)
|
||
|
||
const unix_epoch = '19700101000000'
|
||
const is_name_valid_datestr = /^\d+$/.test(dirname) && (dirname.length == 14) && (dirname.startsWith('2') || dirname == unix_epoch) && parseVersionDateStr(dirname)
|
||
assert(is_name_valid_datestr, `Version directories must be a 14-character long date string like 20251231235959! ${dirname}`)
|
||
|
||
// get all directory entries in archive/<snapshot_id>/versions/<version_id>/*
|
||
const version_dir_entries = (
|
||
(await fs.promises.readdir(version_dir, {withFileTypes: true}))
|
||
.filter((dirent) => !dirent.name.startsWith('.')))
|
||
|
||
// assert version dir contains only actual snapshot output files (not-symbolic links or other version dirs)
|
||
for (const version_dir_entry of version_dir_entries) {
|
||
assert(version_dir_entry.name != 'versions', `Version dir cannot contain another versions folder! ${prettyPath(version_dir)}/versions`)
|
||
assert(!version_dir_entry.isSymbolicLink(), `Version dir cannot contain symbolic link! ${prettyPath(version_dir)}/${version_dir_entry.name}`)
|
||
}
|
||
|
||
// color highlight the unix epoch version in black, and any version created today in blue
|
||
let pretty_dirname = dirname
|
||
if (dirname == unix_epoch) {
|
||
pretty_dirname = ANSI.black + unix_epoch + ANSI.reset
|
||
}
|
||
const today = versionStrFromDate(new Date(), {withDate: true, withTime: false})
|
||
if (dirname.startsWith(today)) {
|
||
pretty_dirname = ANSI.blue + dirname + ANSI.reset
|
||
}
|
||
|
||
// write version dir file listing w/ sizes & hashes to .files.json
|
||
const directory_info = await getDirInfo(version_dir, { withRoot: true, withHelpers: false, maxdepth: 3 })
|
||
await overwriteFile(path.join(version_dir, '.files.json'), directory_info)
|
||
|
||
console.log(` √ ./versions/${pretty_dirname} contains`, version_dir_entries.length, 'results')
|
||
}
|
||
|
||
async function setupSnapshotDB({ original_url, start_time, snapshot_dir }) {
|
||
// setup Snapshot database row, finding it if it already exists or creating a new one
|
||
|
||
const timestamp = snapshot_dir.split('/').at(-1)
|
||
const search_attrs = { url: original_url, timestamp }
|
||
const update_attrs = { url: original_url, timestamp, added: start_time, title: null }
|
||
|
||
let snapshot = await Snapshot.findOne({ where: search_attrs });
|
||
let created = false
|
||
if (!snapshot) {
|
||
snapshot = await Snapshot.findOne({ where: {url: original_url} });
|
||
if (snapshot) {
|
||
// console.warn(`[X] Found DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) that has different timestamp from existing dir ${prettyPath(snapshot_dir)}!`)
|
||
// throw 'Snapshot DB record does not match filesystem path!'
|
||
} else {
|
||
console.log(`[+] Creating new DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) for ${prettyPath(snapshot_dir)}...`)
|
||
// ;([snapshot, created] = await Snapshot.findOrCreate({where: search_attrs, defaults: update_attrs }));
|
||
// throw 'Wanted to create new Snapshot but refusing to modify DB during testing!'
|
||
}
|
||
}
|
||
|
||
// assert(snapshot && (snapshot instanceof Snapshot))
|
||
return snapshot
|
||
}
|
||
|
||
async function setupViewport(page, _page_state) {
|
||
// setup viewport
|
||
await page.setViewport(DEFAULT_VIEWPORT);
|
||
await page.setGeolocation(DEFAULT_GEOLOCATION);
|
||
// await page.setBypassCSP(true); // bypass CSP restrictions (requires --disable-web-security)
|
||
page.setDefaultTimeout(DEFAULT_TIMEOUT);
|
||
|
||
// Optional: emulate a mobile device
|
||
// await page.emulate(puppeteer.devices['iPhone 6']);
|
||
|
||
// Configure light mode/dark mode & accessibility reduced motion preferences
|
||
await page.emulateMediaFeatures([
|
||
{name: 'prefers-color-scheme', value: DEFAULT_COLOR_SCHEME},
|
||
{name: 'prefers-reduced-motion', value: 'reduce'},
|
||
]);
|
||
|
||
// Setup headers & deterministically choose a random referrer based on URL
|
||
const rand_idx = hashCode(await page.url()) % DEFAULT_REFERRERS.length
|
||
await page.setExtraHTTPHeaders({
|
||
...DEFAULT_HEADERS,
|
||
referer: DEFAULT_REFERRERS[rand_idx], // note: the header name is spelled "referer" (HTTP's historical misspelling)
|
||
})
|
||
|
||
// Setup alert to trigger if site tries to sniff whether we are a bot
|
||
function sniffDetector() {
|
||
const userAgent = window.navigator.userAgent;
|
||
const platform = window.navigator.platform;
|
||
// @ts-ignore
|
||
window.navigator.__defineGetter__('userAgent', function () {
|
||
// @ts-ignore
|
||
window.navigator.sniffed = true;
|
||
return userAgent;
|
||
});
|
||
// @ts-ignore
|
||
window.navigator.__defineGetter__('platform', function () {
|
||
// @ts-ignore
|
||
window.navigator.sniffed = true;
|
||
return platform;
|
||
});
|
||
}
|
||
await page.evaluateOnNewDocument(sniffDetector);
|
||
// @ts-ignore
|
||
const was_sniffed = await page.evaluate(() => (!!window.navigator.sniffed))
|
||
if (was_sniffed) {
|
||
console.warn('[⚠️] Site tried to sniff if we are a bot! Site may be difficult to archive.')
|
||
}
|
||
|
||
return page
|
||
}
|
||
|
||
async function setupModalAutoClosing(page, page_state, {timeout=1_250}={}) {
|
||
page.on('dialog', (dialog) => {
|
||
console.log(`[👆] Auto-closing modal that popped up: ${dialog.message()}...`)
|
||
setTimeout(() => {try { dialog.accept() } catch(err) {}}, timeout);
|
||
})
|
||
|
||
// if you expect a file-upload dialog, use this to catch it instead:
|
||
// const [fileChooser] = await Promise.all([
|
||
// page.waitForFileChooser(),
|
||
// ]);
|
||
// await fileChooser.accept(['/tmp/myfile.pdf']);
|
||
page.on('close', () => {
|
||
try {
|
||
page.off('dialog')
|
||
} catch(err) {}
|
||
})
|
||
}
|
||
|
||
async function startScreenrecording(page, page_state, {duration_limit=60, codec='libx264'}={}) {
|
||
await fs.promises.mkdir(path.dirname(SCREENRECORDING_PATH(page)), {recursive: true})
|
||
// console.log(`[🎬] Starting screen-recording stream...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page)))
|
||
|
||
// alternative: interact with low-level puppeteer screencast API directly
|
||
// using puppeteer.page.screencast: https://pptr.dev/api/puppeteer.page.screencast
|
||
// const recorder = await page.screencast({path: SCREENRECORDING_PATH(page)});
|
||
|
||
// alternative: use puppeteer-stream for .webm/.mp4 screen recordings with tab audio included
|
||
// works sometimes but has a few issues, e.g.: https://github.com/SamuelScheit/puppeteer-stream/issues/8
|
||
|
||
// alternative: puppeteer-screen-recorder (most compatible/stable but doesn't include tab audio output)
|
||
const recorder = new PuppeteerScreenRecorder(page, {
|
||
followNewTab: false,
|
||
recordDurationLimit: duration_limit,
|
||
// fps: 25,
|
||
// ffmpeg_Path: '<path of ffmpeg_path>' || null,
|
||
// videoFrame: {
|
||
// width: 1024,
|
||
// height: 768,
|
||
// },
|
||
// videoCrf: 18,
|
||
videoCodec: codec,
|
||
// videoPreset: 'ultrafast',
|
||
// videoBitrate: 1000,
|
||
// autopad: {
|
||
// color: 'black' | '#35A5FF',
|
||
// },
|
||
// aspectRatio: '4:3',
|
||
});
|
||
page_state.recorder = recorder
|
||
await recorder.start(SCREENRECORDING_PATH(page))
|
||
|
||
page.on('close', async () => {await saveScreenrecording(page, page_state)});
|
||
return page_state
|
||
}
|
||
|
||
async function startResponseSaving(page, page_state) {
|
||
const dir = RESPONSES_PATH(page)
|
||
await fs.promises.mkdir(dir, {recursive: true})
|
||
|
||
console.log(`[🌄] Starting raw response bytes recording...`.padEnd(82), prettyPath(dir) + '/')
|
||
|
||
// Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
|
||
const types_to_save = [
|
||
// 'document',
|
||
'script',
|
||
'stylesheet',
|
||
'font',
|
||
'image',
|
||
'media',
|
||
'xhr',
|
||
'websocket',
|
||
]
|
||
|
||
// reset responses index file to empty
|
||
const responses_log_path = path.join(dir, 'index.jsonl')
|
||
await overwriteFile(responses_log_path, '')
|
||
|
||
// add handler to save all matching response bodies (scripts, images, media, xhr, etc.) into the output directory
|
||
page.on('response', async (response) => {
|
||
try {
|
||
|
||
const timestamp = versionStrFromDate(new Date(), {withDate: true, withTime: true, withSeconds: true, withMilliseconds: true})
|
||
|
||
if (!page_state.main_response && (response.request().url() == page_state.original_url)) {
|
||
// save first response as main page response (if we havent already caught it earlier)
|
||
page_state.main_response = response
|
||
}
|
||
|
||
const status = response.status()
|
||
if ((status >= 300) && (status < 500)) {
|
||
// console.log('Got bad response from', response.url(), 'to', response.headers()['location'])
|
||
return
|
||
}
|
||
const request = response.request()
|
||
const resourceType = request.resourceType()
|
||
const url_scheme = (response.url() || request.url()).split(':')[0].toLowerCase()
|
||
const method = (url_scheme === 'data') ? 'DATA' : request.method()
|
||
|
||
// console.log(' ', resourceType, response.url())
|
||
if (types_to_save.includes(resourceType)) {
|
||
// create ./responses/xhr/www.facebook.com/static/images/icons/ subdir based on hostname + path
|
||
const resource_type_dir = path.join(dir, resourceType)
|
||
const url = new URL(response.url())
|
||
let subdir = resource_type_dir
|
||
const url_path = (url.pathname || '').slice(0, 250).endsWith('/')
|
||
? (url.pathname || '').slice(0, 250)
|
||
: path.dirname((url.pathname || '').slice(0, 250))
|
||
|
||
// determine subdirectory based on url type (handles http:,https:,file:,data:,chrome-extension:,about:,etc.)
|
||
if (!URL_SCHEMES_IGNORED.includes(url_scheme)) {
|
||
// is a normal http:// or https:// url, use the domain + path to construct subdirectory
|
||
subdir = path.join(resource_type_dir, (url.hostname || 'data').slice(0, 250), url_path)
|
||
} else if (url_scheme == 'data') {
|
||
// is a data:... url, store in ./data subdirectory
|
||
subdir = path.join(resource_type_dir, 'data')
|
||
} else {
|
||
// is a chrome-extension:// or other special url, use the extension id + path to construct subdirectory
|
||
const url_path = path.dirname((url.pathname || '').slice(0, 999))
|
||
subdir = path.join(resource_type_dir, url_scheme, (url.hostname || 'data').slice(0, 250), url_path)
|
||
}
|
||
|
||
// write response to responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
|
||
let abspath = null
|
||
let resp_mimetype = null
|
||
let extension = ''
|
||
let uniq_filename = null
|
||
let uniq_abspath = null
|
||
let symlink_abspath = null
|
||
let responseSha256 = null
|
||
try {
|
||
await fs.promises.mkdir(path.join(dir, 'all'), {recursive: true})
|
||
try {
|
||
await fs.promises.mkdir(subdir, {recursive: true})
|
||
} catch(err) {
|
||
subdir = subdir + '.dir' // TODO: apply this workaround to parent path entries too
|
||
try {
|
||
await fs.promises.mkdir(subdir, {recursive: true})
|
||
} catch(err) {
|
||
subdir = path.join(resource_type_dir, 'data')
|
||
await fs.promises.mkdir(subdir, {recursive: true})
|
||
}
|
||
}
|
||
;({abspath: symlink_abspath, resp_mimetype, extension} = await detectFilename({page, response, dir: subdir, resourceType}))
|
||
|
||
// responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
|
||
uniq_filename = `${timestamp}__${method}__` + [encodeURIComponent(url.href).slice(0, 64).replaceAll('/', '_').replace(new RegExp(`.${extension}$`), ''), extension].filter(s => s.length).join('.')
|
||
uniq_abspath = path.join(dir, 'all', uniq_filename)
|
||
|
||
|
||
let bytesBuffer = null
|
||
try {
|
||
bytesBuffer = await response.buffer()
|
||
} catch(err) {
|
||
if (String(err).includes("Cannot read properties of undefined (reading 'body')")) {
|
||
// not sure why this happens, but sometimes it's too late to capture the body (possible race condition)
|
||
} else {
|
||
console.warn('[⚠️] Failed to save response bytes for:', response.request().url(), err)
|
||
}
|
||
}
|
||
if (bytesBuffer) {
|
||
// write response data into ./all/<TS>__<METHOD>__<URL>.<EXT>
|
||
await overwriteFile(uniq_abspath, bytesBuffer)
|
||
|
||
responseSha256 = crypto.createHash('sha256').update(bytesBuffer).digest('hex')
|
||
|
||
// write symlink file to ./<TYPE>/<DOMAIN>/...<PATH>/<FILENAME>.<EXT> -> ./all/<TS>__<METHOD>__<URL>.<EXT>
|
||
await overwriteSymlink(uniq_abspath, symlink_abspath, {relative: dir, mkdirs: true, search_limit: dir})
|
||
}
|
||
// console.log(' ->', symlink_abspath)
|
||
} catch(err) {
|
||
// dont do anything for redirect responses, error responses, etc.
|
||
console.warn(err)
|
||
}
|
||
|
||
const urlSha256 = crypto.createHash('sha256').update(String(request.url())).digest('hex')
|
||
// const headersSha256 = crypto.createHash('sha256').update(String(request.headers())) // someday we may want to save headers hashes too
|
||
|
||
const truncated_url = (method == 'DATA') ? request.url().slice(0, 128) : request.url() // don't duplicate bytes in data: urls (we already saved them in the file)
|
||
|
||
// this is essentially replicating the functionality of a WARC file, but in directory + index.jsonl form
|
||
await fs.promises.appendFile(
|
||
responses_log_path,
|
||
JSON.stringify({
|
||
ts: timestamp,
|
||
method,
|
||
url: truncated_url,
|
||
urlSha256,
|
||
postData: request.postData(),
|
||
response_url: ((method != 'DATA') && (url.href != request.url())) ? url.href : undefined,
|
||
status,
|
||
resourceType,
|
||
mimeType: resp_mimetype,
|
||
responseSha256,
|
||
path: uniq_abspath?.replace(dir, '.'),
|
||
symlink_path: symlink_abspath?.replace(dir, '.'),
|
||
extension,
|
||
}) + '\n',
|
||
'utf-8',
|
||
)
|
||
}
|
||
} catch(err) {
|
||
// we should never throw hard errors here because there's nothing above us to catch it
|
||
// and we dont want to crash the entire CDP session / browser / main node process
|
||
console.warn('[❌] Error in response handler (set in startResponseSaving):', err)
|
||
}
|
||
});
|
||
// handled by stopMetadataRecording():
|
||
// page.on('close', () => {
|
||
// page.off('response')
|
||
// })
|
||
}
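// Example of a single line appended to responses/index.jsonl by the handler above
// (field values are hypothetical; `path` points into ./all/, `symlink_path` into the per-type tree built by detectFilename;
//  postData/response_url are omitted when undefined):
// {"ts":"20240131120000000","method":"GET","url":"https://example.com/app.js","urlSha256":"...","status":200,
//  "resourceType":"script","mimeType":"application/javascript","responseSha256":"...",
//  "path":"./all/20240131120000000__GET__https%3A%2F%2Fexample.com%2Fapp.js","symlink_path":"./script/example.com/app.js","extension":"js"}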
|
||
|
||
function dedupeCookies(cookies) {
|
||
const len_before = cookies.length
|
||
|
||
const allowed_cookie_attrs = ['domain', 'path', 'name', 'value', 'expires', 'sameSite', 'sourceScheme', 'url', 'priority', 'secure', 'httpOnly']
|
||
|
||
const deduped_cookies = {}
|
||
for (const cookie of cookies) {
|
||
try {
|
||
const unique_id = `${cookie.domain}${cookie.path}${cookie.name}`
|
||
deduped_cookies[unique_id] = {
|
||
...(deduped_cookies[unique_id] || {}),
|
||
...cookie,
|
||
expires: 2147483640, // max allowed expiry time (2038-01-19, just under the 32-bit limit)
|
||
session: false, // make sure cookies dont expire at browser close time
|
||
secure: false, // make cookie restrictions more lax (for archiving scripts)
|
||
httpOnly: false, // make it easier to tamper with cookies from JS (for archiving scripts)
|
||
|
||
// "path": "/",
|
||
// "expires": 2147483641,
|
||
// "size": 194,
|
||
// "httpOnly": false,
|
||
// "secure": false,
|
||
// "session": false,
|
||
// "priority": "High",
|
||
// "sameParty": false,
|
||
// "sourceScheme": "Secure",
|
||
// "sourcePort": 443
|
||
|
||
// and more... https://pptr.dev/api/puppeteer.cookieparam
|
||
} as Cookie
|
||
|
||
if (!deduped_cookies[unique_id].value) {
|
||
delete deduped_cookies[unique_id]
|
||
continue
|
||
}
|
||
if (deduped_cookies[unique_id].name.startsWith('__')) {
|
||
// cookies that start with __ must be secure, see https://github.com/puppeteer/puppeteer/issues/6806
|
||
deduped_cookies[unique_id].secure = true
|
||
deduped_cookies[unique_id].sourceScheme = 'Secure'
|
||
}
|
||
if (deduped_cookies[unique_id].domain.startsWith('.')) {
|
||
deduped_cookies[unique_id].sameParty = false
|
||
deduped_cookies[unique_id].domain = deduped_cookies[unique_id].domain.slice(1)
|
||
}
|
||
|
||
for (const key of Object.keys(deduped_cookies[unique_id])) {
|
||
if (!allowed_cookie_attrs.includes(key)) {
|
||
delete deduped_cookies[unique_id][key]
|
||
}
|
||
}
|
||
} catch(err) {
|
||
console.error('[❌] Failed to parse cookie during deduping', cookie)
|
||
throw err
|
||
}
|
||
}
|
||
// console.log(`[🍪] Deduped ${len_before} cookies to ${Object.keys(deduped_cookies).length}...`)
|
||
|
||
return Object.values(deduped_cookies) as Cookie[]
|
||
}
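// Illustrative behavior: two cookies with the same (domain, path, name) collapse into one entry,
// keeping the later cookie's value and forcing the relaxed attributes set above:
//   dedupeCookies([
//     { domain: 'example.com', path: '/', name: 'sid', value: 'old' },
//     { domain: 'example.com', path: '/', name: 'sid', value: 'new' },
//   ])
//   // => [{ domain: 'example.com', path: '/', name: 'sid', value: 'new', expires: 2147483640, secure: false, httpOnly: false }]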
|
||
|
||
async function loadCookiesTxt() {
|
||
const cookies = [] as Cookie[]
|
||
return cookies // write-only from chrome -> files for now
|
||
|
||
if (fs.existsSync(COOKIES_TXT_PATH)) {
|
||
// console.log(`[🍪] Loading cookies/localStorage/sessionStorage from ${COOKIES_TXT_PATH}...`)
|
||
|
||
// Read from to cookies.txt file using tough-cookie + @root/file-cookie-store
|
||
const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false});
|
||
cookies_store.getAllCookiesAsync = util.promisify(cookies_store.getAllCookies);
|
||
const exported_cookies = await cookies_store.getAllCookiesAsync()
|
||
for (const cookie of exported_cookies) {
|
||
const cookie_from_tough = cookie.toJSON()
|
||
const domain = cookie_from_tough.hostOnly ? `.${cookie_from_tough.domain}` : cookie_from_tough.domain
|
||
const cookie_for_puppeteer: Cookie = {
|
||
domain,
|
||
name: cookie_from_tough.key,
|
||
path: cookie_from_tough.path,
|
||
value: cookie_from_tough.value,
|
||
secure: cookie_from_tough.secure || false,
|
||
httpOnly: cookie_from_tough.httpOnly || false,
|
||
session: false,
|
||
expires: (new Date(cookie_from_tough.expires)).valueOf()/1000,
|
||
size: undefined,
|
||
}
|
||
// console.log('COOKIE_FROM_TOUGH_TXT', cookie_from_tough, cookie_for_puppeteer)
|
||
cookies.push(cookie_for_puppeteer)
|
||
}
|
||
}
|
||
}
|
||
|
||
type AuthJSON = {
|
||
cookies: Cookie[],
|
||
sessionStorage: any,
|
||
localStorage: any,
|
||
}
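// Example auth.json shape this type expects (illustrative; the storage maps are keyed by origin):
// {
//   "cookies": [{ "domain": "example.com", "path": "/", "name": "sid", "value": "..." }],
//   "localStorage":   { "https://example.com": { "theme": "dark" } },
//   "sessionStorage": { "https://example.com": { "csrf_token": "..." } }
// }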
|
||
|
||
async function loadAuthStorage(page, {client}, {apply=true}={}) {
|
||
var {
|
||
cookies,
|
||
sessionStorage,
|
||
localStorage,
|
||
}: AuthJSON = {cookies: [], sessionStorage: {}, localStorage: {}}
|
||
|
||
if (!LOAD_AUTH_STORAGE) {
|
||
// dont read auth from filesystem auth.json/cookies.txt, just rely on existing cookies in chrome profile
|
||
return {cookies, sessionStorage, localStorage}
|
||
}
|
||
|
||
if (fs.existsSync(COOKIES_TXT_PATH)) {
|
||
try {
|
||
cookies = await loadCookiesTxt()
|
||
} catch(err) {
|
||
console.warn('[⚠️] Loaded invalid cookies.txt, moved it to cookies.txt.corrupted (did two processes try to change it at the same time?)')
|
||
await fs.promises.rename(COOKIES_TXT_PATH, COOKIES_TXT_PATH + '.corrupted')
|
||
}
|
||
// console.log(`[🍪] Loading cookies from cookies.txt...`, cookies.length)
|
||
}
|
||
|
||
if (fs.existsSync(AUTH_JSON_PATH)) {
|
||
try {
|
||
var {
|
||
cookies: auth_json_cookies,
|
||
sessionStorage,
|
||
localStorage,
|
||
} = JSON.parse(await fs.promises.readFile(AUTH_JSON_PATH, 'utf-8'));
|
||
cookies = [...cookies, ...auth_json_cookies]
|
||
// console.log(`[🍪] Loading cookies from auth.json...`, auth_json_cookies.length)
|
||
} catch(err) {
|
||
console.warn('[⚠️] Loaded invalid auth.json, moved it to auth.json.corrupted (did two processes try to change it at the same time?)')
|
||
await fs.promises.rename(AUTH_JSON_PATH, AUTH_JSON_PATH + '.corrupted')
|
||
}
|
||
}
|
||
|
||
cookies = dedupeCookies(cookies)
|
||
|
||
if (apply) {
|
||
console.log(`[🍪] Loading stored cookies/localStorage/sessionStorage into session...`, cookies.length)
|
||
|
||
// if (cookies?.length) {
|
||
// try {
|
||
// // try setting all at once first (much faster)
|
||
// await page.setCookie(...cookies)
|
||
// } catch(err) {
|
||
// // if any errors, fall back to setting one-by-one so that individual error can be caught
|
||
// for (const cookie of cookies) {
|
||
// try {
|
||
// await page.setCookie(cookie);
|
||
// } catch(err) {
|
||
// console.error('[❌] Failed to set cookie', cookie)
|
||
// throw err
|
||
// }
|
||
// }
|
||
// }
|
||
// }
|
||
const origin = await page.evaluate(() => window.location.origin)
|
||
|
||
await page.evaluate((savedSessionStorage) => {
|
||
for (const [key, value] of Object.entries(savedSessionStorage)) {
|
||
sessionStorage[key] = value;
|
||
}
|
||
}, sessionStorage[origin] || {});
|
||
|
||
await page.evaluate((savedLocalStorage) => {
|
||
for (const [key, value] of Object.entries(savedLocalStorage)) {
|
||
localStorage[key] = value;
|
||
}
|
||
}, localStorage[origin] || {});
|
||
|
||
// origin/auth context changes when we do page.goto so we have to hook pageload and apply it then as well
|
||
// https://stackoverflow.com/questions/51789038/set-localstorage-items-before-page-loads-in-puppeteer
|
||
await page.evaluateOnNewDocument(({sessionStorage, localStorage}) => {
|
||
const origin = window.location.origin;
|
||
|
||
for (const [key, value] of Object.entries(sessionStorage[origin] || {})) {
|
||
window.sessionStorage.setItem(key, value as string)
|
||
}
|
||
for (const [key, value] of Object.entries(localStorage[origin] || {})) {
|
||
window.localStorage.setItem(key, value as string)
|
||
}
|
||
|
||
}, {sessionStorage, localStorage});
|
||
}
|
||
|
||
return {cookies, sessionStorage, localStorage}
|
||
}
|
||
|
||
async function loadCloudflareCookie(page, {original_url}, {timeout=20_000}={}) {
|
||
// make request to FlareSolverr server to get magic cookies that let us bypass cloudflare bot detection
|
||
// docker run -p 8191:8191 -e LOG_LEVEL=info ghcr.io/flaresolverr/flaresolverr
|
||
|
||
|
||
// alternatives if this stops working:
|
||
// - https://github.com/omkarcloud/botasaurus
|
||
// - https://github.com/ultrafunkamsterdam/nodriver
|
||
// - https://github.com/Akmal-CloudFreed/CloudFreed-CloudFlare-bypass
|
||
// - https://github.com/VeNoMouS/cloudscraper
|
||
|
||
const query = { url: original_url, cmd: "request.get", maxTimeout: timeout }
|
||
try {
|
||
const response = await fetch(FLARESOLVERR_API_ENDPOINT, {
|
||
method: 'POST',
|
||
headers: {'Content-Type': 'application/json'},
|
||
body: JSON.stringify(query),
|
||
});
|
||
const data = await response.json();
|
||
|
||
const new_cookies = (data?.solution?.cookies || []).map(cookie => ({
|
||
...cookie,
|
||
'expires': 2147483640, // overwrite expiration to just under the 32bit maximum timestamp (2038-01-19)
|
||
'secure': false, // cookie value is plain text (not encrypted/encoded)
|
||
}))
|
||
|
||
if (new_cookies.length) {
|
||
console.log(`[☑️] Got Cloudflare bypass cookies (${new_cookies.length}) from FlareSolverr API...`)
|
||
await page.setCookie(...new_cookies);
|
||
return new_cookies
|
||
} else {
|
||
const error_str = JSON.stringify(data?.message || data, null, 4)
|
||
throw `Bad FlareSolverr Response: ${error_str}`
|
||
}
|
||
|
||
} catch (error) {
|
||
if (JSON.stringify(error).includes('Challenge not detected')) {
|
||
console.log('[☑️] Page is accessible without FlareSolverr Cloudflare bypass.')
|
||
} else {
|
||
console.warn('[❌] Failed to get Cloudflare bypass cookies from FlareSolverr API.', error)
|
||
}
|
||
}
|
||
return []
|
||
}
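
// Example FlareSolverr exchange (sketch only, assuming the default docker setup above listening at FLARESOLVERR_API_ENDPOINT):
//   POST body:     {"cmd": "request.get", "url": "https://example.com", "maxTimeout": 20000}
//   response body: {"status": "ok", "message": "", "solution": {"url": "...", "status": 200, "userAgent": "Mozilla/5.0 ...",
//                    "cookies": [{"name": "cf_clearance", "value": "...", "domain": ".example.com", ...}, ...]}}
// only data.solution.cookies is consumed here; expiry/secure flags are normalized before being passed to page.setCookie()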
|
||
|
||
async function setupURLRewriting(page, page_state) {
|
||
await page.setRequestInterception(true);
|
||
|
||
const rewrites = URL_REWRITES.sort((a, b) => (a.idx || 0) - (b.idx || 0))
|
||
|
||
page.on('request', interceptedRequest => {
|
||
if (interceptedRequest.isInterceptResolutionHandled()) return;
|
||
|
||
const original_url = interceptedRequest.url()
|
||
|
||
// apply all the rewrites in order to the request URL
|
||
let url = original_url
|
||
for (const rewrite of rewrites) {
|
||
const new_url = url.replace(rewrite.pattern, rewrite.replacement)
|
||
// console.log(rewrite, url, new_url)
|
||
|
||
// if the url is rewritten to an empty string, abort the request
|
||
if (!new_url) {
|
||
console.warn('[🟥] Request blocked', rewrite.pattern, ':', url)
|
||
interceptedRequest.abort()
|
||
return
|
||
}
|
||
else if (new_url && new_url != url) {
|
||
// console.warn('[📳] Request rewritten', rewrite.pattern, rewrite.replacement, ':', url, '->', new_url)
|
||
console.warn('[📳] Request rewritten', rewrite.pattern, ':', new_url)
|
||
url = new_url
|
||
}
|
||
}
|
||
|
||
if (url == original_url) {
|
||
// if url is unchanged, continue request flow as-is
|
||
interceptedRequest.continue()
|
||
} else {
|
||
// otherwise redirect the browser to our rewritten version
|
||
interceptedRequest.respond({
|
||
status: 302,
|
||
headers: {
|
||
location: url,
|
||
'x-redirect-by': 'ArchiveBox.setupURLRewriting',
|
||
},
|
||
})
|
||
}
|
||
});
|
||
// handled by stopMetadataRecording():
|
||
// page.on('close', () => {
|
||
// page.off('request')
|
||
// page.setRequestInterception(false)
|
||
// })
|
||
}
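
// Example URL_REWRITES entries (illustrative sketch only, the real list is defined elsewhere in this file):
//   {idx: 0, pattern: /^http:\/\//, replacement: 'https://'},                       // rewritten URL differs => respond with a 302 to the new URL
//   {idx: 1, pattern: /^https?:\/\/example-tracker\.com.*/, replacement: ''},       // rewritten URL is ''   => request is aborted/blocked
// rewrites are applied in idx order via url.replace(pattern, replacement) on every intercepted request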
|
||
|
||
async function startMetadataRecording(page, {original_url, version, client, traffic_log, console_log, redirects}) {
|
||
// update helper state on page
|
||
page._original_url = (original_url || (await page.url())).toString()
|
||
|
||
// DEBUGGING: helpers for repl() debugging, don't rely on these (global state is bad, mmkay)
|
||
// page._client = client || page._client || await page.target().createCDPSession()
|
||
// page._redirects = redirects
|
||
// page._traffic_log = traffic_log
|
||
|
||
// add initial entry to page redirect log
|
||
redirects[original_url] = {
|
||
idx: 0,
|
||
url: original_url,
|
||
src: null,
|
||
type: 'Initial',
|
||
wallTime: Date.now()/1000,
|
||
frameId: page.mainFrame()._id,
|
||
requestId: null,
|
||
initiator: {type: "user"},
|
||
isMainFrame: true,
|
||
}
|
||
|
||
// DEBUGGING: record optional chrome debug trace with screenshots (heavy)
|
||
// try {
|
||
// await page.tracing.stop()
|
||
// await wait(200)
|
||
// } catch(err) {}
|
||
// try {
|
||
// await page.tracing.start({path: TRACE_PATH(page), screenshots: true});
|
||
// } catch(err) {}
|
||
|
||
let last_main_frame_url = original_url
|
||
|
||
// setup network request intercepts handler
|
||
const addCDPRequestDataListener = (eventName) => {
|
||
client.on(eventName, event => {
|
||
try {
|
||
// save any HTTP/JS redirects to redirects for saveRedirects(page) to use later on
|
||
const new_url = event.documentURL
|
||
const http_status = event.redirectResponse?.status || 0
|
||
const is_new_url = (new_url !== original_url) && !redirects[new_url]
|
||
const is_main_frame_navigation = (event.frameId == page.mainFrame()._id)
|
||
const is_http_redirect = (300 < http_status) && (http_status < 400)
|
||
|
||
if (new_url && is_new_url && (is_main_frame_navigation || is_http_redirect) && event.type == 'Document') {
|
||
const new_redirect_entry = {
|
||
url: new_url,
|
||
src: event.redirectResponse?.url || last_main_frame_url,
|
||
type: http_status || 'JS',
|
||
wallTime: Date.now()/1000,
|
||
frameId: event.frameId,
|
||
requestId: event.requestId,
|
||
initiator: event.initiator,
|
||
idx: Object.keys(redirects).length,
|
||
isMainFrame: is_main_frame_navigation,
|
||
}
|
||
redirects[new_url] = new_redirect_entry
|
||
if (is_main_frame_navigation) {
|
||
ALREADY_ARCHIVED.add(new_redirect_entry.url.slice(0, 4096)) // we're already archiving this tab as it redirects, don't create a duplicate archive for the destination
|
||
console.warn(`[➡️] NAVIGATION[${new_redirect_entry.type}]${ANSI.blue} ${last_main_frame_url} ${ANSI.reset}\n ->${ANSI.blue} ${new_redirect_entry.url} ${ANSI.reset}`)
|
||
last_main_frame_url = new_url
|
||
}
|
||
}
|
||
|
||
if (event.loaderId) {
|
||
traffic_log[event.loaderId] = traffic_log[event.loaderId] || {} // make sure loader is also in requests list first
|
||
// sometimes it's not in the list if we start archiving too late / after a page's initial request was already made
|
||
}
|
||
|
||
// save to traffic_log as {8BC2087A2CCEF28017099C0E10E87440: {Network.eventWillBeSent: {eventId,loaderId, request|response, ...}}
|
||
// https://stackoverflow.com/questions/47078655/missing-request-headers-in-puppeteer?noredirect=1&lq=1
|
||
traffic_log[event.requestId] = traffic_log[event.requestId] || {}
|
||
Object.assign(traffic_log[event.requestId], { [eventName]: event })
|
||
|
||
// DEBUGGING: log page visits and navigation events to console
|
||
// if (event?.response?.status) {
|
||
// // if we're expecting an HTML response, then we assume it's a page visit & log it to console
|
||
// const acceptMimeType = traffic_log[event.requestId]['Network.requestWillBeSentExtraInfo']?.headers?.accept
|
||
// if (acceptMimeType && acceptMimeType.includes('text/html')) {
|
||
// // log any HTML page responses (less noisy)
|
||
// console.log(`[>] GOT ${event.documentURL}: ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
|
||
// } else {
|
||
// // log ALL responses, including JS, CSS, images, etc. (very noisy)
|
||
// // console.log(` > ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
|
||
// }
|
||
// }
|
||
} catch(err) {
|
||
console.warn('[X] Error during request/response handler (startMetadataRecording.addCDPRequestDataListener)')
|
||
console.warn(err)
|
||
}
|
||
})
|
||
}
|
||
addCDPRequestDataListener('Network.requestWillBeSent')
|
||
addCDPRequestDataListener('Network.requestWillBeSentExtraInfo')
|
||
addCDPRequestDataListener('Network.responseReceived')
|
||
addCDPRequestDataListener('Network.responseReceivedExtraInfo')
|
||
|
||
// clear any existing log entries
|
||
const consolelog_info = {
|
||
TYPE: 'console',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
}
|
||
await overwriteFile(CONSOLELOG_PATH(page), JSON.stringify(consolelog_info) + '\n')
|
||
|
||
// record console logs from page
|
||
const appendConsoleLog = async (line) => {
|
||
if (!line) return
|
||
console_log.push(line)
|
||
await fs.promises.appendFile(
|
||
CONSOLELOG_PATH(page),
|
||
line + '\n',
|
||
'utf-8',
|
||
)
|
||
}
|
||
|
||
page.on('console', async(message) =>
|
||
await appendConsoleLog(`${message.type().toUpperCase()} ${message.location()} ${JSON.stringify(message.text())}`))
|
||
page.on('pageerror', async (error) =>
|
||
await appendConsoleLog(error.message || JSON.stringify(error)))
|
||
page.on('requestfailed', async (request) =>
|
||
await appendConsoleLog(`${request.failure()?.errorText} ${request.url() || JSON.stringify(request)}`))
|
||
|
||
// set puppeteer options on page
|
||
await client.send('Network.enable') // enable network tampering API
|
||
await client.send('Emulation.clearDeviceMetricsOverride'); // clear timing statistics
|
||
await client.send('Page.setDownloadBehavior', {
|
||
behavior: 'allow',
|
||
downloadPath: CHROME_DOWNLOADS_DIR,
|
||
})
|
||
|
||
// handled by stopMetadataRecording():
|
||
// page.on('close', () => {
|
||
// try {
|
||
// page.off('request')
|
||
// page.off('console')
|
||
// page.off('pageerror')
|
||
// page.off('requestfailed')
|
||
// page.setRequestInterception(false)
|
||
// } catch(err) {
|
||
// // some versions of puppeteer have had race conditions here where page is already closed by now
|
||
// console.warn('[X] Error in page close handler', err)
|
||
// }
|
||
// })
|
||
|
||
return {original_url, client, redirects, traffic_log, console_log}
|
||
}
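
// Shape of the traffic_log built above (sketch, keyed by CDP requestId, one sub-key per CDP event type):
//   traffic_log['8BC2087A2CCEF28017099C0E10E87440'] == {
//     'Network.requestWillBeSent':          {requestId, loaderId, documentURL, request: {url, method, headers, ...}, ...},
//     'Network.requestWillBeSentExtraInfo': {headers: {...extra browser-added request headers...}, ...},
//     'Network.responseReceived':           {response: {url, status, headers, mimeType, securityDetails, ...}, ...},
//     'Network.responseReceivedExtraInfo':  {headers: {...extra browser-added response headers...}, ...},
//   }
// saveHeaders()/saveSSL()/saveRedirects() further down read their data back out of this structure.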
|
||
|
||
async function stopMetadataRecording(page, _page_state) {
|
||
console.log('[🪝] Stopping CDP event hooks and request interception...')
|
||
try {
|
||
page.off('request')
|
||
page.off('response')
|
||
page.off('console')
|
||
page.off('pageerror')
|
||
page.off('requestfailed')
|
||
page.off('hashchange')
|
||
page.setRequestInterception(false)
|
||
// page.tracing.stop()
|
||
} catch(err) {
|
||
// some versions of puppeteer have had race conditions here where page is already closed by now
|
||
console.warn('[X] Error while removing page event handlers (stopMetadataRecording)', err)
|
||
}
|
||
}
|
||
|
||
/********************** Human Behavior Emulation ******************************/
|
||
|
||
async function solveCaptchas(page, page_state, {timeout=90_000}={}) {
|
||
|
||
// using puppeteer-extra-plugin-recaptcha auto-solver
|
||
// await page.solveRecaptchas()
|
||
|
||
// using 2captcha-solver extension auto-solver
|
||
try {
|
||
// console.log('[🕑] Waiting for CAPTCHA to appear...')
|
||
await page.waitForSelector('.captcha-solver', {timeout: 5_000})
|
||
|
||
console.log('[🤖] CAPTCHA challenge found, submitting to 2Captcha for solving...')
|
||
await page.click('.captcha-solver')
|
||
|
||
console.log(`[🧠] Waiting up to ${timeout/1000}s for CAPTCHA to be solved...`)
|
||
await page.waitForSelector(`.captcha-solver[data-state="solved"]`, {timeout})
|
||
|
||
console.log('[🔓] CAPTCHA solution retrieved from 2captcha.')
|
||
} catch(err) {
|
||
console.log('[☑️] No CAPTCHA challenges found, site thinks we are human.')
|
||
}
|
||
}
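
// The 2captcha-solver extension injects a .captcha-solver button next to any CAPTCHA it detects, and flips its
// data-state attribute to "solved" once a solution has been applied (this is the behavior the selectors above rely on);
// if the button never appears within 5s we assume the page served no challenge.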
|
||
|
||
async function jiggleMouse(page, page_state, {timeout=600}={}) {
|
||
console.log(`[🐁] Moving mouse around randomly for ${timeout/1000}s...`)
|
||
|
||
const randomPoint = await getRandomPagePoint(page)
|
||
const cursor = createCursor(page, randomPoint, true)
|
||
|
||
cursor.toggleRandomMove(true)
|
||
await wait(timeout/2);
|
||
await cursor.moveTo({x: DEFAULT_VIEWPORT.width/2, y: DEFAULT_VIEWPORT.height/2});
|
||
await wait(timeout/2);
|
||
cursor.toggleRandomMove(false)
|
||
}
|
||
|
||
async function blockRedirects(page, {original_url}) {
|
||
page.on('request', req => {
|
||
if (req.isInterceptResolutionHandled()) return;
|
||
|
||
// if it's a top-level navigation event to a new url
|
||
if (req.isNavigationRequest() && req.frame() === page.mainFrame() && req.url() !== original_url) {
|
||
req.abort('aborted');
|
||
console.warn('[🟥] Blocked page attempt to navigate to new URL', req.url())
|
||
} else {
|
||
req.continue();
|
||
}
|
||
});
|
||
// handled by stopMetadataRecording():
|
||
// page.on('close', () => {
|
||
// page.off('request')
|
||
// page.setRequestInterception(false)
|
||
// })
|
||
await page.setRequestInterception(true);
|
||
}
|
||
|
||
async function blockJSExecution(page, _page_state) {
|
||
console.warn('[🟥] Stopping all JS execution on page...')
|
||
await page.evaluate(() => {
|
||
debugger;
|
||
})
|
||
// OR alternatively this (more buggy, breaks many sites):
|
||
// const html = await page.content();
|
||
// page.setJavaScriptEnabled(false);
|
||
// await page.setContent(html, { waitUntil: 'networkidle0' }); // 4
|
||
}
|
||
|
||
async function scrollDown(page, _page_state, {timeout=120_000, scroll_delay=SCROLL_DELAY, scroll_distance=SCROLL_DISTANCE, scroll_limit=SCROLL_LIMIT}={}) {
|
||
const starting_height = await page.evaluate('document.body.scrollHeight');
|
||
let last_height = starting_height
|
||
|
||
let scroll_count = 0;
|
||
let scroll_position = scroll_count * scroll_distance
|
||
// await page.bringToFront()
|
||
|
||
// scroll to top
|
||
await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
|
||
|
||
while ((scroll_count < scroll_limit) && ((scroll_delay * scroll_count) < timeout)) {
|
||
console.log(`[⬇️] Scrolling down ${scroll_count}x ${scroll_distance}px... (${scroll_position}/${last_height})`)
|
||
await page.evaluate((y_offset) => { window.scrollTo({ top: y_offset, left: 0, behavior: 'smooth' }); }, scroll_position);
|
||
scroll_count++
|
||
scroll_position = scroll_count * scroll_distance
|
||
|
||
// check if any new content was added / if we are infiniscrolling
|
||
let new_height = await page.evaluate('document.body.scrollHeight')
|
||
const added_px = new_height - last_height
|
||
if (added_px > 0) {
|
||
console.log('[✚] Detected infini-scrolling...', `${last_height}+${added_px} => ${new_height}`)
|
||
} else if (scroll_position >= new_height + scroll_distance) {
|
||
// we've reached the bottom, condition isn't true until we've tried to go n+1 past the end (which is fine)
|
||
if (scroll_count > 2)
|
||
break
|
||
}
|
||
last_height = new_height
|
||
|
||
// sleep 2s, perform the smooth scroll down by 1000px, and increment the counter
|
||
await wait(scroll_delay);
|
||
|
||
// facebook watch pages infini-scroll forever (more and more recommendations), so stop them after 3 pages
|
||
if (page._original_url.startsWith('https://www.facebook.com/watch/?v') && scroll_count > 3) break
|
||
}
|
||
|
||
// scroll to bottom
|
||
if (scroll_position < last_height) {
|
||
await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); });
|
||
await wait(scroll_delay)
|
||
await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); });
|
||
}
|
||
|
||
// Always wait an additional 2sec at the end for scroll animations / loading / rendering to settle down
|
||
console.log('[📉] Reached bottom of the page.', `(${scroll_position}/${last_height})`)
|
||
await wait(scroll_delay);
|
||
await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
|
||
await wait(scroll_delay);
|
||
|
||
return last_height
|
||
}
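
// Worked example of the scroll loop above (sketch, assuming the 1000px / 2s defaults implied by the comments):
//   a static 5200px-tall page is covered in ~7 iterations (~14s) before the bottom check breaks out of the loop;
//   an infini-scrolling page keeps growing document.body.scrollHeight, so the loop only stops at scroll_limit,
//   at the timeout, or after 3 iterations for facebook /watch/ pages.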
|
||
|
||
async function disableAnimations(page, _page_state) {
|
||
console.log(`[⛄️] Disabling all animations using CSS override...`)
|
||
|
||
// https://stackoverflow.com/questions/53167644/injecting-css-into-site-with-puppeteer
|
||
const css_override = `*, *::before, *::after {
|
||
-moz-animation: none !important;
|
||
-moz-transition: none !important;
|
||
animation: none !important;
|
||
transition: none !important;
|
||
caret-color: transparent !important;
|
||
}`
|
||
|
||
// inject override into current page
|
||
await page.addStyleTag({content: css_override});
|
||
|
||
// inject override into any subsequently navigated pages
|
||
await page.evaluateOnNewDocument((css_override) => {
|
||
const style_tag = document.createElement('style')
|
||
style_tag.type = 'text/css'
|
||
style_tag.innerHTML = css_override
|
||
document.getElementsByTagName('head')[0].appendChild(style_tag)
|
||
}, css_override);
|
||
}
|
||
|
||
async function expandComments(page, _page_state, {timeout=120_000, limit=15_000, delay=650}={}) {
|
||
console.log(`[🗃️] Expanding up to ${limit} comments every ${delay}ms...`)
|
||
|
||
// expand all <details> sections in Github READMEs, HedgeDoc pages, etc.
|
||
// (page.$$eval passes the whole array of matched elements to the callback, so iterate over it)
await page.$$eval('pierce/article details', elems => elems.forEach(elem => { elem.open = true })) // expand Github README details sections
await page.$$eval('pierce/div.js-discussion details:not(.details-overlay)', elems => elems.forEach(elem => { elem.open = true })) // expand Github issue discussion hidden comments
await page.$$eval('pierce/.markdown-body details', elems => elems.forEach(elem => { elem.open = true })) // expand HedgeDoc Markdown details sections
|
||
|
||
await page.exposeFunction('onHashChange', url => page.emit('hashchange', url));
|
||
await page.evaluateOnNewDocument(() => {
|
||
// @ts-ignore
|
||
addEventListener('hashchange', (e) => onHashChange(location.href));
|
||
});
|
||
|
||
// Listen for hashchange events in node Puppeteer code.
|
||
page.on('hashchange', url => console.log('Page tried to navigate to:', new URL(url)));
|
||
|
||
|
||
const num_expanded = await page.evaluate(async ({timeout, limit, delay}) => {
|
||
function getElementsByXPath(xpath, ctx?) {
|
||
var results = [];
|
||
var xpathResult = document.evaluate(
|
||
xpath, // e.g. //*[text()='"+text+"']
|
||
ctx || document,
|
||
null,
|
||
XPathResult.ORDERED_NODE_ITERATOR_TYPE,
|
||
null
|
||
);
|
||
var node;
|
||
while ((node = xpathResult.iterateNext()) != null) {
|
||
results.push(node);
|
||
}
|
||
return results;
|
||
}
|
||
|
||
let num_expanded = 0
|
||
const getLoadMoreLinks = () => [
|
||
// find all the buttons/links to expand collapsed/hidden/lazy-loaded content
|
||
...document.querySelectorAll('faceplate-partial[loading=action]'), // new reddit
|
||
...document.querySelectorAll('a[onclick^="return morechildren"]'), // old reddit show more replies
|
||
...document.querySelectorAll('a[onclick^="return togglecomment"]'), // old reddit show hidden replies
|
||
// ...document.querySelectorAll('a.js-show-link'), // stack overflow comments show more (TODO: make this only work on SO)
|
||
// ...document.querySelectorAll('a.morelink'), // HackerNews profile show more (TODO: make this only work on HN)
|
||
// ...getElementsByXPath("//*[text()~='View \d+ replies']"), // facebook comment expander
|
||
...getElementsByXPath("//*[text()='Show more replies']"), // twitter infiniscroll expander
|
||
...getElementsByXPath("//*[text()='Show replies']"), // twitter replies expander
|
||
]
|
||
const wait = (ms) => new Promise(res => setTimeout(res, ms))
|
||
|
||
let load_more_links = getLoadMoreLinks()
|
||
while (load_more_links.length) {
|
||
console.log('Expanding comments...', load_more_links.length)
|
||
for (const link of load_more_links) {
|
||
link.scrollIntoView({behavior: 'smooth'})
|
||
if (link.slot == 'children') {
|
||
continue
|
||
// patch new reddit "More replies" links that would open in a new window to display inline instead
|
||
// const comment_id = link.src.split('?')[0].split('/').at(-1)
|
||
// link.slot = `children-${comment_id}-0`
|
||
// link.__alwaysShowSlot = false
|
||
}
|
||
// click the "More replies" button
|
||
link.click()
|
||
num_expanded++
|
||
await wait(delay)
|
||
const time_elapsed = num_expanded * delay
|
||
if ((num_expanded > limit) || (time_elapsed > timeout))
|
||
return num_expanded
|
||
}
|
||
load_more_links = getLoadMoreLinks()
|
||
}
|
||
return num_expanded
|
||
}, {timeout, limit, delay});
|
||
|
||
page.off('hashchange')
|
||
|
||
if (num_expanded) {
|
||
console.log(`[🗃️] Expanded ${num_expanded} comments...`)
|
||
|
||
// scroll to bottom, then back up to top
|
||
const final_height = await page.evaluate('document.body.scrollHeight');
|
||
await page.evaluate((top) => { window.scrollTo({ top, left: 0, behavior: 'smooth' }); }, final_height + 1000);
|
||
await wait(delay);
|
||
await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
|
||
await wait(delay);
|
||
}
|
||
|
||
}
|
||
|
||
async function submitForm(page, _page_state, {timeout=5_000}={}) {
|
||
try {
|
||
await page.waitForSelector('form button[type=submit]', {timeout: 1_500});
|
||
console.log('[☑️] Submitting form...')
|
||
await page.click('form button[type=submit]')
|
||
await page.waitForNavigation({timeout});
|
||
await page.goBack();
|
||
} catch (err) {
|
||
// no form found
|
||
}
|
||
}
|
||
|
||
// TODO: add an evasion to set navigator.connection.rtt = 365 (0 = detectable as headless)
|
||
|
||
/******************************************************************************/
|
||
/******************************************************************************/
|
||
|
||
/**************** Extension-Based Archive Output Tasks ************************/
|
||
|
||
async function saveSinglefile(page, {main_response, extensions}) {
|
||
const extension = extensions.filter(({name}) => name === 'singlefile')[0]
|
||
if (!extension.version) throw 'Could not find Singlefile extension ID, is it installed?'
|
||
|
||
const url = await page.url() || main_response.url()
|
||
if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
|
||
|
||
// get list of existing past files in downloads/* to ignore
|
||
const files_before = new Set(
|
||
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||
.filter(fn => fn.endsWith('.html'))
|
||
);
|
||
|
||
const out_path = SINGLEFILE_PATH(page)
|
||
|
||
console.log(`[🛠️] Saving Singlefile HTML using extension (${extension.id})...`.padEnd(82+1), prettyPath(CHROME_DOWNLOADS_DIR))
|
||
await page.bringToFront() // action button acts on the foreground tab, so it has to be in front :(
|
||
await extension.dispatchAction()
|
||
let files_new = []
|
||
|
||
const check_delay = 3_000
|
||
for (const _try in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) {
|
||
await wait(check_delay)
|
||
|
||
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)).filter(fn => fn.endsWith('.html'));
|
||
files_new = files_after.filter(file => !files_before.has(file))
|
||
|
||
if (files_new.length == 0) {
|
||
// console.warn(` ...waiting for Singlefile to write HTML into ${CHROME_DOWNLOADS_DIR}...`)
|
||
continue
|
||
}
|
||
// iterate through new downloads and find a matching .html containing our page's URL in the header
|
||
for (const file of files_new) {
|
||
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file)
|
||
const dl_text = await fs.promises.readFile(dl_path, 'utf-8')
|
||
const dl_header = dl_text.split('meta charset')[0]
|
||
if (dl_header.includes(`url: ${url}`)) {
|
||
// don't need this check anymore now that all output is versioned:
|
||
// if (fs.existsSync(out_path)) {
|
||
// const {size: existingSize} = await fs.promises.stat(out_path)
|
||
// const {size: newFileSize} = await fs.promises.stat(dl_path)
|
||
// if (newFileSize < existingSize) {
|
||
// console.log(`[🗑️] Discarding singlefile output (${file}) as it's smaller than existing ${out_path}...`)
|
||
// await fs.promises.rm(dl_path)
|
||
// return out_path
|
||
// }
|
||
// }
|
||
console.log(`[✍️] Moving Singlefile download from ${file}...`.padEnd(82), prettyPath(out_path))
|
||
await fs.promises.rename(dl_path, out_path)
|
||
return out_path
|
||
}
|
||
}
|
||
}
|
||
|
||
console.warn(`[❌] Couldn't find matching Singlefile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay*10)/1000}s:`, files_new.join(', '))
|
||
return null
|
||
}
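
// SingleFile prefixes its saved HTML with a metadata comment before the <meta charset> tag, roughly like the
// following (format as relied on by the header check above, shown here only as a sketch):
//   <!--
//    Page saved with SingleFile
//    url: https://example.com/some/page
//    saved date: Mon Jan 01 2024 ...
//   -->
// which is why dl_header.includes(`url: ${url}`) is enough to match a download back to the tab that produced it.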
|
||
|
||
async function saveArchiveWebPage(page, {extensions}, {timeout=30_000}={}) {
|
||
// TODO: waiting on them to expose commands so we can generate .wacz easily
|
||
// https://github.com/webrecorder/archiveweb.page/issues/207
|
||
// ...
|
||
const browser = await page.browser()
|
||
const extension = extensions.filter(({name}) => name === 'archivewebpage')[0]
|
||
await page.bringToFront()
|
||
await extension.dispatchPopup()
|
||
await extension.dispatchAction()
|
||
const popup = await browser.waitForTarget(
|
||
target => target.url().toString().startsWith(`chrome-extension://${extension.id}/popup.html`),
|
||
{timeout: 5_000},
|
||
)
|
||
await page.bringToFront()
|
||
|
||
// await puppeteer.Locator.race([
|
||
// popup.locator('::-p-aria(Start With Autopilot)'),
|
||
// popup.locator('wr-popup-viewer >>>> input'),
|
||
// popup.locator(':scope >>> input')
|
||
// ])
|
||
// .setTimeout(timeout)
|
||
// .click({
|
||
// offset: {
|
||
// x: 7.7265625,
|
||
// y: 7.203125,
|
||
// },
|
||
// });
|
||
|
||
// @ts-ignore
|
||
await puppeteer.Locator.race([
|
||
popup.locator('wr-popup-viewer >>>> div.status-row > p'),
|
||
popup.locator(':scope >>> div.status-row > p'),
|
||
popup.locator('::-p-text(Recording: \n)')
|
||
]).setTimeout(timeout).click({
|
||
delay: 733.3000000007451,
|
||
offset: {
|
||
x: 293,
|
||
y: 13.5,
|
||
},
|
||
})
|
||
|
||
await wait(8_000)
|
||
|
||
// @ts-ignore
|
||
await puppeteer.Locator.race([
|
||
popup.locator('wr-popup-viewer >>>> div:nth-of-type(2) > button > span:nth-of-type(2)'),
|
||
popup.locator(':scope >>> div:nth-of-type(2) > button > span:nth-of-type(2)'),
|
||
popup.locator('::-p-text(Stop)')
|
||
]).setTimeout(timeout).click({
|
||
offset: {
|
||
x: 7.859375,
|
||
y: 23.203125,
|
||
},
|
||
});
|
||
|
||
return null
|
||
}
|
||
|
||
async function savePocket(page, {extensions}) {
|
||
const browser = await page.browser()
|
||
const extension = extensions.filter(({name}) => name === 'pocket')[0]
|
||
if (!extension.version) throw 'Could not find Pocket extension ID, is it installed?'
|
||
|
||
console.log(`[🛠️] Saving URL to Pocket API using extension (${extension.id})...`, 'https://getpocket.com/saves')
|
||
await page.bringToFront() // action button acts on the foreground tab, so it has to be in front
|
||
await extension.dispatchAction()
|
||
try {
|
||
const login_window = await browser.waitForTarget(
|
||
target => target.url().toString().startsWith('https://getpocket.com/'),
|
||
{timeout: 3_000},
|
||
)
|
||
// login window will open if pocket is not signed-in
|
||
if (login_window) return false
|
||
} catch(e) {
|
||
// no new window should open if it saves correctly
|
||
return true
|
||
}
|
||
}
|
||
|
||
/***************** Synchronous Archive Output Tasks ***************************/
|
||
|
||
async function saveScreenrecording(page, page_state, {save_gif=true}={}) {
|
||
if (page_state.recorder) {
|
||
const duration = Date.now() - page_state.start_ts
|
||
console.log(`[🎥] Saving screen-recording video (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page)))
|
||
const recorder = page_state.recorder
|
||
page_state.recorder = null
|
||
await recorder.stop()
|
||
|
||
// create symlink for legacy path
|
||
const snap_dir = page_state.snapshot_dir
|
||
const legacy_path = path.join(snap_dir, 'media', 'screenrecording.mp4')
|
||
await overwriteSymlink(SCREENRECORDING_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir})
|
||
|
||
// // remove duplicate frames (white frames at start while it loads + static image at end)
|
||
// const video_path = SCREENRECORDING_PATH(page)
|
||
// const short_path = video_path.replace('.mp4', '.short.mp4')
|
||
// try {
|
||
// await exec(
|
||
// // create a shortened video starting from 0:02s to 0:01s with duplicate frames removed (can look jumpy sometimes)
|
||
// `ffmpeg -ss 2 -sseof -1 -y -i ${video_path} -vf mpdecimate,setpts=N/FRAME_RATE/TB ${short_path}`
|
||
// )
|
||
// } catch(err) {
|
||
// console.log('[❌] Failed to shorten screenrecording.mp4')
|
||
// }
|
||
|
||
// convert video to GIF
|
||
if (save_gif) {
|
||
try {
|
||
const BIN_NAME = '/Volumes/NVME/Users/squash/bin/ffmpeg'
|
||
const child = child_process.spawn(
|
||
BIN_NAME,
|
||
[
|
||
'-hide_banner',
|
||
'-loglevel', 'error',
|
||
'-ss', '3',
|
||
'-t', '10',
|
||
'-y',
|
||
'-i', SCREENRECORDING_PATH(page),
|
||
'-vf', "fps=10,scale=1024:-1:flags=bicubic,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse",
|
||
'-loop', '0',
|
||
SCREENRECORDGIF_PATH(page),
|
||
],
|
||
{
|
||
cwd: path.dirname(SCREENRECORDING_PATH(page)),
|
||
timeout: 60_000,
|
||
// stdio: [null, 'pipe', 'pipe'],
|
||
stdio: 'ignore',
|
||
detached: true, // run in background, don't block on response
|
||
},
|
||
)
|
||
await blockUntilExists(SCREENRECORDGIF_PATH(page), {min_bytes: 100, timeout: 40_000})
|
||
console.log(`[🎥] Saved screen-recording GIF with ffmpeg pid=${child.pid} (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDGIF_PATH(page)))
|
||
|
||
const snap_dir = page_state.snapshot_dir
|
||
const legacy_path = path.join(snap_dir, 'media', 'screenrecording.gif')
|
||
await overwriteSymlink(SCREENRECORDGIF_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir})
|
||
} catch(err) {
|
||
console.log('[❌] Failed to convert video to GIF:', err)
|
||
}
|
||
}
|
||
|
||
return SCREENRECORDING_PATH(page)
|
||
}
|
||
return null
|
||
}
|
||
|
||
async function saveScreenshot(page, _page_state, {aspect_ratio=SCREENSHOT_ASPECT_RATIO, width=null, height=null, jpg_width=1440, jpg_quality=90, timeout=30_000}={}) {
|
||
try {await fs.promises.unlink(SCREENSHOT_PATH(page))} catch(err) {}
|
||
|
||
// setup width and height
|
||
width = width || DEFAULT_VIEWPORT.width
|
||
assert((typeof width === 'number') && width > 200)
|
||
height = height || Math.floor(width/aspect_ratio)
|
||
assert((typeof height === 'number') && height > 200)
|
||
|
||
console.log(`[📸] Saving full-page screenshot (${width}x${height}px)...`.padEnd(82), prettyPath(SCREENSHOT_PATH(page)))
|
||
|
||
// set width, height, and deviceScale factor: https://github.com/puppeteer/puppeteer/issues/1576
|
||
await page.setViewport({ ...DEFAULT_VIEWPORT, width, height, deviceScaleFactor: 2})
|
||
await page.bringToFront()
|
||
await wait(1_250) // page takes a sec to settle after foregrounding and viewport update
|
||
|
||
// take lossless fullpage screenshot of 1920x1440+px (4:3+) -> ./screenshot.png
|
||
await page.screenshot({ path: SCREENSHOT_PATH(page), fullPage: true, type: 'png' })
|
||
|
||
// wait for the screenshot to be created, then set the viewport to the next size
|
||
await blockUntilExists(SCREENSHOT_PATH(page), {min_bytes: 100, timeout})
|
||
await wait(6_000) // puppeteer takes a while to finish writing png data when fullPage: true
|
||
|
||
const jpg_height = Math.floor(jpg_width/aspect_ratio)
|
||
await page.setViewport({ ...DEFAULT_VIEWPORT, width: jpg_width, height: jpg_height, deviceScaleFactor: 2})
|
||
await wait(1_250) // page takes a sec to settle after foregrounding and viewport update
|
||
|
||
// WARNING: make sure you never try to create two screenshots at the same time (especially not fullpage screenshots)
|
||
// that's why there are all these delays here.
|
||
// screenshot creation messes up the whole viewport while it's running,
|
||
// and it writes bad/white empty screenshots if you try to make more than one concurrently
|
||
|
||
// take compressed screenshot of jpg_width*jpg_height (4:3) -> ./screenshot.jpg
|
||
await page.screenshot({
|
||
path: SCREENSHOT_JPG_PATH(page),
|
||
type: 'jpeg',
|
||
quality: jpg_quality,
|
||
clip: {
|
||
x: 0,
|
||
y: 0,
|
||
width: jpg_width,
|
||
height: jpg_height,
|
||
},
|
||
captureBeyondViewport: false,
|
||
});
|
||
await blockUntilExists(SCREENSHOT_JPG_PATH(page), {min_bytes: 100, timeout: timeout/2})
|
||
console.log(`[📸] Saved screenshot as screenshot.jpg (${jpg_width}x${jpg_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))
|
||
|
||
// reset viewport back to defaults
|
||
await wait(1_250)
|
||
await page.setViewport(DEFAULT_VIEWPORT)
|
||
|
||
// ALTERNATIVE METHOD based on cropping fullpage png and converting to jpg manually:
|
||
// import {PNG} from 'pngjs';
|
||
// import jpeg from 'jpeg-js';
|
||
// setTimeout(async () => {
|
||
// try {
|
||
// const screenshot_png = SCREENSHOT_PATH(page);
|
||
// const screenshot_jpg = SCREENSHOT_JPG_PATH(page)
|
||
// const jpg_max_height = height
|
||
// const jpg_quality = quality; // Adjust the quality as needed (0-100)
|
||
|
||
// fs.createReadStream(screenshot_png)
|
||
// .pipe(new PNG())
|
||
// .on('parsed', function () {
|
||
// const width = this.width;
|
||
// const height = this.height;
|
||
|
||
// let cropped_height = height;
|
||
// if (height > jpg_max_height) {
|
||
// cropped_height = jpg_max_height;
|
||
// }
|
||
|
||
// const cropped_bytes = new Uint8Array(width * cropped_height * 4);
|
||
// for (let y = 0; y < cropped_height; y++) {
|
||
// for (let x = 0; x < width; x++) {
|
||
// const idx = (width * y + x) << 2;
|
||
// cropped_bytes[idx] = this.data[idx];
|
||
// cropped_bytes[idx + 1] = this.data[idx + 1];
|
||
// cropped_bytes[idx + 2] = this.data[idx + 2];
|
||
// cropped_bytes[idx + 3] = this.data[idx + 3];
|
||
// }
|
||
// }
|
||
|
||
// const jpeg_obj = {
|
||
// data: cropped_bytes,
|
||
// width: width,
|
||
// height: cropped_height,
|
||
// };
|
||
|
||
// const jpeg_bytes = jpeg.encode(jpeg_obj, jpg_quality);
|
||
// fs.writeFileSync(screenshot_jpg, jpeg_bytes.data);
|
||
// console.log(`[📸] Saved screenshot as screenshot.jpg (${width}x${jpg_max_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))
|
||
// });
|
||
// } catch(err) {
|
||
// console.error('[X] Error while generating JPG screenshot', SCREENSHOT_JPG_PATH(page), err)
|
||
// }
|
||
// }, DELAY_BEFORE_JPG_CONVERSION)
|
||
|
||
// ALTERNATIVE METHOD TO WRITE SCREENSHOT JPG:
|
||
// await wait(5_000) // puppeteer takes a while to finish writing png data when fullPage: true
|
||
// if ((await page.evaluate('document.body.scrollHeight')) > max_height) {
|
||
// // if page exceeds max_height, save additional cropped screenshot as screenshot.top.png
|
||
// // (needed b.c. uncropped screenshot may have insane 1:20+ aspect ratio that is hard to use elsewhere)
|
||
// await page.screenshot({ path: SCREENSHOT_JPG_PATH(page), type: 'jpg', quality: 100})
|
||
// await wait(1_000) // page takes a sec settle after a screenshot
|
||
// }
|
||
|
||
return SCREENSHOT_PATH(page)
|
||
}
|
||
|
||
async function savePDF(page, _page_state, {timeout=30_000}={}) {
|
||
const url = page.url() || 'about:blank'
|
||
if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
|
||
|
||
const out_path = PDF_PATH(page)
|
||
console.log(`[📓] Saving print-as-PDF export...`.padEnd(82), prettyPath(out_path))
|
||
await page.bringToFront()
|
||
try {await fs.promises.unlink(PDF_PATH(page))} catch(err) {}
|
||
|
||
// await page.emulateMediaType('screen') // print as "@media(screen) instead of @media(print)"
|
||
|
||
// page.createPDFStream lets us to save larger PDFs than page.pdf() before crashing
|
||
// (streams to disk in chunks instead of all at once)
|
||
const pdf_stream = await page.createPDFStream({
|
||
timeout: timeout,
|
||
printBackground: true,
|
||
outline: true,
|
||
tagged: true,
|
||
format: 'A4',
|
||
displayHeaderFooter: false,
|
||
// margin: { top: '0.5cm', right: '1cm', bottom: '0.8cm', left: '1cm' },
|
||
})
|
||
const reader = pdf_stream.getReader()
|
||
|
||
// iterate through reader and append chunks to out_path
|
||
await fs.promises.rm(out_path, {force: true})
|
||
let num_bytes = 0
|
||
let error = '0 bytes written'
|
||
try {
|
||
while (true) {
|
||
const {done, value} = await reader.read()
|
||
if (done) break;
|
||
await fs.promises.appendFile(out_path, value)
|
||
num_bytes += value.length;
|
||
}
|
||
} catch(err) {
error = err // keep the real error so the warning below doesn't just report the "0 bytes written" default
num_bytes = 0
}
|
||
|
||
if (!num_bytes) {
|
||
console.warn('[❌] Failed to save PDF', JSON.stringify(error, null, 4))
|
||
await fs.promises.rm(out_path, {force: true})
|
||
return null
|
||
}
|
||
|
||
return out_path
|
||
}
|
||
|
||
async function inlineShadowDOM(page, _page_state, {limit=100_000}={}) {
|
||
console.log(`[😎] Replacing Shadow DOM elements with inline HTML...`)
|
||
|
||
try {
|
||
const num_replaced = await page.evaluate((limit) => {
|
||
let num_replaced = 0
|
||
|
||
// Returns HTML of given shadow DOM.
|
||
const getShadowDomHtml = (shadowRoot) => {
|
||
let shadowHTML = '';
|
||
for (const el of shadowRoot.childNodes) {
|
||
shadowHTML += el.nodeValue || el.outerHTML;
|
||
}
|
||
return shadowHTML;
|
||
};
|
||
|
||
// Recursively replaces shadow DOMs with their HTML.
|
||
const replaceShadowDomsWithHtml = (rootElement) => {
|
||
if (num_replaced > limit) return
|
||
for (const el of rootElement.querySelectorAll('*')) {
|
||
if (el.shadowRoot) {
|
||
replaceShadowDomsWithHtml(el.shadowRoot);
|
||
el.innerHTML += getShadowDomHtml(el.shadowRoot);
|
||
}
|
||
}
|
||
num_replaced++
|
||
};
|
||
|
||
replaceShadowDomsWithHtml(document.body);
|
||
|
||
return num_replaced
|
||
}, limit)
|
||
// console.log(' √ replaced', num_replaced, 'Shadow DOM trees')
|
||
} catch(err) {
|
||
console.log('[⚠️] Inlining Shadow DOM failed', err)
|
||
}
|
||
}
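
// Example of the flattening above (sketch): a <my-widget> whose shadowRoot contains <p>hello</p> ends up serialized
// as <my-widget>...<p>hello</p></my-widget>, so saveDOM()/savePandoc()/saveReadability() below can see web-component
// content that a plain page.content() dump would otherwise omit (shadow roots are not serialized by outerHTML).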
|
||
|
||
async function saveAIQualityAssuranceResult(page, {original_url, version}) {
|
||
console.log(`[🧠] Analyzing screenshot with GPT-4o for QA checks...`.padEnd(82), prettyPath(AIQA_PATH(page)))
|
||
|
||
let screenshot_path = SCREENSHOT_PATH(page)
|
||
const screenshot_cropped_path = SCREENSHOT_JPG_PATH(page)
|
||
|
||
if (fs.existsSync(screenshot_cropped_path)) {
|
||
// screenshot is too tall to pass to openai, send cropped version instead
|
||
screenshot_path = screenshot_cropped_path
|
||
}
|
||
try {
|
||
await blockUntilExists(screenshot_path, {min_bytes: 100, timeout: 7_500})
|
||
} catch (err) {
|
||
console.warn('[❌] Failed to send screenshot to GPT-4o for analysis, no screenshot.{png,jpg} exists', err)
|
||
return null
|
||
}
|
||
var stdout = ''
|
||
var stderr = ''
|
||
let result = null
|
||
const PYTHON_BIN = path.join(__dirname, '.venv/bin/python')
|
||
const SCRIPT_PATH = path.join(__dirname, 'ai_qa.py')
|
||
await blockUntilExists(PYTHON_BIN, {min_bytes: 1, timeout: 250})
|
||
await blockUntilExists(SCRIPT_PATH, {min_bytes: 1, timeout: 250})
|
||
|
||
try {
|
||
var {stdout, stderr} = await exec(
|
||
`${PYTHON_BIN} ${SCRIPT_PATH} --attach '${screenshot_path}'`
|
||
)
|
||
result = JSON.parse(stdout.toString())
|
||
if (!result) throw 'Got empty result!'
|
||
result = {
|
||
TYPE: 'aiqa',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
...result,
|
||
}
|
||
} catch(parse_err) {
|
||
console.warn('[❌] Failed to get OpenAI analysis for screenshot.png', parse_err, stderr)
|
||
}
|
||
if (!(result || stdout)) {
|
||
return null
|
||
}
|
||
await overwriteFile(
|
||
AIQA_PATH(page),
|
||
result || stdout.toString(),
|
||
)
|
||
|
||
|
||
|
||
return result
|
||
}
|
||
|
||
async function saveYTDLP(page, {original_url, version}, {max_size='750m'}={}) {
|
||
console.log(`[🎥] Saving media with YT-DLP (<=${max_size})...`.padEnd(82), prettyPath(YTDLP_PATH(page)))
|
||
|
||
await fs.promises.mkdir(YTDLP_PATH(page), {recursive: true})
|
||
|
||
const cwd = YTDLP_PATH(page)
|
||
const bin_name = 'yt-dlp'
|
||
const timeout = 300_000 // 5min timeout
|
||
const args = [
|
||
'--restrict-filenames',
|
||
'--trim-filenames', '128',
|
||
'--write-description',
|
||
'--write-info-json',
|
||
'--write-annotations',
|
||
'--write-thumbnail',
|
||
'--no-call-home',
|
||
'--write-sub',
|
||
'--write-auto-subs',
|
||
'--convert-subs=srt',
|
||
'--yes-playlist',
|
||
'--continue',
|
||
'--no-abort-on-error',
|
||
'--ignore-errors',
|
||
'--geo-bypass',
|
||
'--add-metadata',
|
||
`--format=(bv*+ba/b)[filesize<=${max_size}][filesize_approx<=?${max_size}]/(bv*+ba/b)`,
|
||
'--no-check-certificate',
|
||
'--no-progress',
|
||
// `--cookies=${COOKIES_TXT_PATH}`, // using logged in cookies actually makes it fail more often, not sure why
|
||
original_url,
|
||
]
|
||
|
||
const {getResult, ...exec_info} = await saveExecResult(bin_name, args, {original_url, version}, {cwd, timeout})
|
||
|
||
return {getResult, ...exec_info}
|
||
}
|
||
|
||
async function saveGALLERYDL(page, {original_url, version}) {
|
||
console.log(`[🎥] Saving photos with gallery-dl...`.padEnd(82), prettyPath(GALLERYDL_PATH(page)))
|
||
|
||
await fs.promises.mkdir(GALLERYDL_PATH(page), {recursive: true})
|
||
|
||
const cwd = GALLERYDL_PATH(page)
|
||
const bin_name = 'gallery-dl'
|
||
const timeout = 300_000 // 5min timeout
|
||
const args = [
|
||
'--verbose',
|
||
'--write-metadata',
|
||
'--write-infojson',
|
||
'--write-tags',
|
||
'--sleep=1.5-2.5',
|
||
`--cookies=${COOKIES_TXT_PATH}`,
|
||
// '--no-check-certificate',
|
||
// `--directory=media`,
|
||
original_url,
|
||
]
|
||
|
||
const {getResult, ...exec_info} = await saveExecResult(bin_name, args, {original_url, version}, {cwd, timeout})
|
||
|
||
return {getResult, ...exec_info}
|
||
}
|
||
|
||
// async function saveWget(page, {original_url, version}) {
|
||
// console.log(`[⎒] Saving wget site clone...`.padEnd(82), prettyPath(WGET_PATH(page)))
|
||
|
||
// const args = [
|
||
// // ...
|
||
// ]
|
||
|
||
// spawn(
|
||
// 'wget',
|
||
// [
|
||
// ...args,
|
||
// original_url,
|
||
// ],
|
||
// {
|
||
// cwd: WGET_PATH(page),
|
||
// detached: true, // run in background, don't block on response
|
||
// stdio: 'ignore',
|
||
// timeout: 300_000, // 5min timeout
|
||
// },
|
||
// )
|
||
|
||
// return {path: WGET_PATH(page)}
|
||
// }
|
||
|
||
/**************** Asynchronous Archive Output Tasks ***************************/
|
||
|
||
type FaviconCandidate = {
|
||
url: string,
|
||
basename: string,
|
||
extension: string,
|
||
expected_mimetype: string,
|
||
}
|
||
|
||
const faviconFromDomain = (url) => {
|
||
// https://auth:pass@t.co:1234/a/bc123 -> https://auth:pass@t.co:1234/favicon.ico
|
||
const url_origin = (new URL(url)).origin
|
||
return {
|
||
url: url_origin ? `${url_origin}/favicon.ico` : null,
|
||
basename: 'favicon',
|
||
extension: undefined, // auto-detect extension at download time in case it redirects us to a png
|
||
expected_mimetype: 'image/', // only accept image/* to avoid saving html/txt error responses as the icon
|
||
} as FaviconCandidate
|
||
}
|
||
|
||
const faviconFromGoogle = (url, size=256) => {
|
||
// https://auth:pass@t.co:1234/a/bc123 -> https://www.google.com/s2/favicons?domain=t.co
|
||
const domain = url && (new URL(url)).hostname
|
||
return {
|
||
url: domain?.includes('.') ? `https://www.google.com/s2/favicons?sz=${size}&domain=${domain}` : null,
|
||
basename: 'google_favicon',
|
||
extension: 'png',
|
||
expected_mimetype: 'image/png', // google always provides PNGs in response
|
||
} as FaviconCandidate
|
||
}
|
||
|
||
const faviconFromHtml = async (page) => {
|
||
// <link rel="icon" src="https://example.com/static/images/favicon.png"/> -> https://example.com/static/images/favicon.png
|
||
let url
|
||
try {
|
||
url = await page.$eval('link[rel*="icon"]', (elem) => elem?.href)
|
||
if (!url || !url.includes('://'))
|
||
url = null
|
||
} catch(err) {
|
||
url = null
|
||
// console.warn('Failed to find favicon tag in html', JSON.stringify(err, null, 4))
|
||
}
|
||
|
||
return {
|
||
url,
|
||
basename: 'favicon',
|
||
extension: undefined, // auto-detect extension at download time
|
||
expected_mimetype: 'image/', // accept any image/* mimetype at download time
|
||
} as FaviconCandidate
|
||
}
|
||
|
||
type FaviconResult = {
|
||
url: string,
|
||
num_bytes: number,
|
||
abspath?: string,
|
||
dir?: string,
|
||
filename?: string,
|
||
mimeType?: string,
|
||
}
|
||
|
||
async function saveFavicon(page, {original_url, main_response, version}) {
|
||
const dir = path.dirname(FAVICON_PATH(page))
|
||
const response_url = main_response?.url()
|
||
|
||
const favicon_downloads_to_try: {[key: string]: FaviconCandidate} = unique([
|
||
await faviconFromHtml(page),
|
||
faviconFromDomain(response_url),
|
||
faviconFromDomain(original_url),
|
||
faviconFromGoogle(response_url),
|
||
faviconFromGoogle(original_url),
|
||
].filter(({url}) => url), 'url')
|
||
|
||
const browser = await page.browser()
|
||
|
||
// let logs = []
|
||
// let errors = []
|
||
let output_files: {[key: string]: FaviconResult} = {}
|
||
|
||
for (const download_options of Object.values(favicon_downloads_to_try)) {
|
||
let result: FaviconResult = {num_bytes: 0, url: download_options.url}
|
||
// {url, num_bytes, abspath, dir, filename, basename, extension, mimeType}
|
||
try {
|
||
// try getting it with node-fetch first
|
||
const response = await fetch(download_options.url) as Response
|
||
const file_options = await detectFilename({...download_options, response, dir})
|
||
if (response.headers.get("content-length")) {
|
||
const favicon_stream = Readable.fromWeb(response.body as any)
|
||
await overwriteFile(file_options.abspath, favicon_stream)
|
||
result = {
|
||
...file_options,
|
||
num_bytes: parseInt(response.headers.get("content-length") || '0'),
|
||
mimeType: response.headers.get("content-type"),
|
||
}
|
||
} else {
|
||
throw 'Failed to download favicon with fetch()'
|
||
}
|
||
} catch(err) {
|
||
// console.warn('[!] Failed to get favicon with node-fetch', err)
|
||
// fallback to getting it by opening a new browser tab
|
||
result = await download({...download_options, browser, dir, page})
|
||
}
|
||
|
||
// logs.push(...(result.logs || []))
|
||
// errors.push(...(result.errors || []))
|
||
|
||
if (result.num_bytes) {
|
||
console.log(`[🌠] Saving page favicon (${result.url.substring(0, 35)}... ${result.mimeType})...`.padEnd(82), prettyPath(result.abspath))
|
||
output_files[result.filename] = result
|
||
break // break here stops after the first successful download, comment out to keep going instead
|
||
}
|
||
}
|
||
const output_file = Object.values(output_files).sort((a, b) => a.num_bytes - b.num_bytes).at(-1)  // pick the largest favicon found
|
||
const favicon_info = {
|
||
TYPE: 'favicon',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
succeeded: !!output_file,
|
||
// stdout: JSON.stringify(logs),
|
||
// stderr: JSON.stringify(errors),
|
||
favicon_url: output_file?.url,
|
||
favicon_urls: Object.keys(favicon_downloads_to_try),
|
||
favicon_files: Object.keys(output_files).map(fname => fname.replace(dir, '.')),
|
||
favicon_filename: output_file?.filename,
|
||
favicon_num_bytes: output_file?.num_bytes,
|
||
}
|
||
await overwriteFile(FAVICON_PATH(page), favicon_info)
|
||
|
||
return favicon_info
|
||
}
|
||
|
||
async function saveTitle(page, {original_url, version}) {
|
||
const title_from_browser = (await page.title()) || null
|
||
const title_from_js = await page.evaluate(() => document?.title || null)
|
||
const title_from_html = await page.evaluate(() => document?.querySelector('title')?.innerText || null)
|
||
const title_from_og = await page.evaluate(() => document?.querySelector('meta[property="og:title"]')?.getAttribute('content') || null)
|
||
|
||
// best guess at best title = longest title
|
||
const title = ([title_from_html, title_from_og, title_from_js, title_from_browser]
|
||
.filter(title => title)
|
||
.sort((a, b) => b.length - a.length)[0] || '')
|
||
.replaceAll('\n', ' ')
|
||
|
||
if (title?.length) {
|
||
console.log(`[📗] Saving page title (${title.substring(0, 40)})...`.padEnd(82), prettyPath(TITLE_PATH(page)))
|
||
await overwriteFile(TITLE_PATH(page), title)
|
||
}
|
||
|
||
const title_info = {
|
||
TYPE: 'title',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
title,
|
||
title_from_html,
|
||
title_from_og,
|
||
title_from_js,
|
||
title_from_browser,
|
||
}
|
||
const title_json_path = TITLE_PATH(page).replace('.txt', '.json')
|
||
await overwriteFile(title_json_path, title_info)
|
||
|
||
return title_info
|
||
}
|
||
|
||
async function saveRaw(page, {main_response}) {
|
||
const response = main_response
|
||
if (!response) {
|
||
console.warn('[⚠️] Failed to save page RAW bytes, main_response is null', response)
|
||
}
|
||
const dir = RAW_PATH(page)
|
||
await fs.promises.mkdir(dir, {recursive: true})
|
||
|
||
const {url, abspath, mimeType} = await detectFilename({page, response, dir})
|
||
|
||
console.log(`[🔟] Saving raw response bytes (${mimeType})...`.padEnd(82), prettyPath(abspath))
|
||
|
||
await download({page, response, abspath})
|
||
return abspath
|
||
}
|
||
|
||
async function saveSourceMaps(page, {original_url, version}) {
|
||
console.log(`[🐛] Saving source maps to ./responses/all/*.{js,css}.map...`)
|
||
|
||
const response_index_path = path.join(RESPONSES_PATH(page), 'index.jsonl')
|
||
const response_index = await fs.promises.readFile(response_index_path, 'utf-8')
|
||
|
||
const urls_to_download = []
|
||
|
||
for (const response of response_index.split('\n')) {
|
||
try {
|
||
const {url, extension} = JSON.parse(response)
|
||
if (['css', 'js'].includes(extension?.toLowerCase())) {
|
||
urls_to_download.push(url + '.map')
|
||
}
|
||
} catch(err) { continue }
|
||
}
|
||
|
||
// TODO: fix this, it needs to run both after responses/index.jsonl exists and before stopMetadataRecording (so the sourcemap fetches still get recorded)
// fix is to use traffic_log to get the response url list instead of waiting for index.jsonl to be created
|
||
await page.evaluate(async (urls_to_download) => {
|
||
const promises = []
|
||
for (const sourcemap_url of urls_to_download) { // 'of' not 'in': we want the urls themselves, not the array indices
|
||
promises.push(fetch(sourcemap_url))
|
||
}
|
||
return Promise.allSettled(promises)
|
||
}, urls_to_download)
|
||
|
||
return {
|
||
TYPE: 'sourcemaps',
|
||
URL: original_url,
|
||
VERSION: version,
|
||
sourcemaps: urls_to_download,
|
||
}
|
||
}
|
||
|
||
async function saveRequests(page, {original_url, version, traffic_log}) {
|
||
console.log(`[📼] Saving requests log (${Object.keys(traffic_log).length})...`.padEnd(82), prettyPath(REQUESTS_PATH(page)))
|
||
|
||
const requests_info = {
|
||
TYPE: 'requests',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
requests: traffic_log,
|
||
}
|
||
|
||
await overwriteFile(REQUESTS_PATH(page), requests_info)
|
||
|
||
return requests_info
|
||
}
|
||
|
||
async function saveRedirects(page, {original_url, main_response, traffic_log, redirects, version}) {
|
||
const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
|
||
const main_response_traffic = traffic_log[main_request_id] || {}
|
||
|
||
const url_from_browser = await page.url() || null
|
||
const url_from_request = (
|
||
main_response?.request()?.url()
|
||
|| main_response_traffic['Network.requestWillBeSent']?.request?.url
|
||
|| null)
|
||
const url_from_response = (
|
||
main_response?.url()
|
||
|| main_response_traffic['Network.responseReceived']?.response?.url
|
||
|| null)
|
||
|
||
const http_redirects =
|
||
Object.values(traffic_log)
|
||
.filter(event => event['Network.requestWillBeSent']?.redirectResponse)
|
||
.map(event => event['Network.requestWillBeSent'])
|
||
.map(requestWillBeSent => ({
|
||
url: requestWillBeSent.request.url,
|
||
src: requestWillBeSent.redirectResponse.url,
|
||
status: requestWillBeSent.redirectResponse.status,
|
||
loaderId: requestWillBeSent.loaderId,
|
||
requestId: requestWillBeSent.requestId,
|
||
wallTime: requestWillBeSent.wallTime,
|
||
initiator: requestWillBeSent.initiator,
|
||
isMainFrame: (requestWillBeSent.loaderId == main_request_id),
|
||
}))
|
||
|
||
const url_parsed = new URL(url_from_response || url_from_request || url_from_browser)
|
||
|
||
const redirects_info = {
|
||
TYPE: 'redirects',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
url_parsed,
|
||
url_from_request,
|
||
url_from_response,
|
||
url_from_browser,
|
||
redirects_from_browser: redirects,
|
||
redirects_from_http: http_redirects,
|
||
}
|
||
console.log(`[🔗] Saving page redirects log (${http_redirects.length})...`.padEnd(82), prettyPath(REDIRECTS_PATH(page)))
|
||
|
||
await overwriteFile(REDIRECTS_PATH(page), redirects_info)
|
||
|
||
return redirects_info
|
||
}
|
||
|
||
async function saveHeaders(page, {original_url, version, traffic_log}) {
|
||
const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
|
||
const main_response_traffic = traffic_log[main_request_id] || {}
|
||
|
||
// combine base request with browser-added request headers
|
||
const request = {...main_response_traffic['Network.requestWillBeSent']?.request}
|
||
const request_extra_headers = main_response_traffic['Network.requestWillBeSentExtraInfo']?.headers || {}
|
||
request.headers = {...request.headers, ...request_extra_headers}
|
||
|
||
// combine base response with browser-added response headers
|
||
const response = {...main_response_traffic['Network.responseReceived']?.response}
|
||
const response_extra_headers = main_response_traffic['Network.responseReceivedExtraInfo']?.headers || {}
|
||
response.headers = {...response.headers, ...response_extra_headers}
|
||
|
||
const headers_info = {
|
||
TYPE: 'headers',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
request,
|
||
response,
|
||
}
|
||
|
||
const num_headers = Object.keys({...request.headers, ...response.headers}).length
|
||
if (num_headers) {
|
||
console.log(`[👾] Saving main request & response headers (${num_headers})...`.padEnd(82), prettyPath(HEADERS_PATH(page)))
|
||
await overwriteFile(HEADERS_PATH(page), headers_info)
|
||
}
|
||
|
||
return headers_info
|
||
}
|
||
|
||
async function saveSSL(page, {original_url, version, traffic_log}) {
|
||
const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
|
||
const main_response_traffic = traffic_log[main_request_id] || {}
|
||
|
||
const relevant_response_keys = [
|
||
'url',
|
||
'status',
|
||
'mimeType',
|
||
'connectionReused',
|
||
'remoteIPAddress',
|
||
'remotePort',
|
||
'fromServiceWorker',
|
||
'encodedDataLength',
|
||
'protocol',
|
||
'alternateProtocolUsage',
|
||
'securityState',
|
||
'securityDetails',
|
||
]
|
||
let ssl_info = Object.entries(main_response_traffic['Network.responseReceived']?.response || {})
|
||
.reduce((obj, [key, val]) => {
|
||
if (relevant_response_keys.includes(key)) {
|
||
obj[key] = val
|
||
}
|
||
return obj
|
||
}, {}) as any
|
||
|
||
// TODO: parse SSL certificate sha256 hash from chrome://system/#chrome_root_store
|
||
// const ssl_certificate = await client.send('Network.getCertificate', {origin: original_url})
|
||
// ssl_info.sslCertSha256 = '<unknown>'
|
||
|
||
ssl_info = {
|
||
TYPE: 'ssl',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
...ssl_info,
|
||
}
|
||
|
||
if (Object.keys(ssl_info).length > 3) {  // i.e. we got more than just the TYPE/VERSION/URL boilerplate keys
|
||
console.log(`[🔏] Saving page SSL details (${ssl_info?.securityDetails?.protocol})...`.padEnd(82), prettyPath(SSL_PATH(page)))
|
||
await overwriteFile(SSL_PATH(page), ssl_info)
|
||
}
|
||
|
||
return ssl_info
|
||
}
|
||
|
||
|
||
async function saveDOM(page, {original_url, version}) {
|
||
const html = await page.content();
|
||
console.log(`[📖] Saving DOM dump (${html.length})...`.padEnd(82), prettyPath(DOM_PATH(page)))
|
||
const html_with_header =
|
||
`<!-- Saved by ArchiveBox TYPE=dom VERSION=${version} URL=${original_url} -->\n${html}`
|
||
await overwriteFile(DOM_PATH(page), html_with_header)
|
||
return DOM_PATH(page)
|
||
}
|
||
|
||
async function saveBodyText(page, _page_state) {
|
||
const innerText = await page.evaluate(() => document?.body?.innerText);
|
||
|
||
if (innerText?.length) {
|
||
console.log(`[📃] Saving body text (${innerText.length})...`.padEnd(82), prettyPath(BODYTEXT_PATH(page)))
|
||
await overwriteFile(BODYTEXT_PATH(page), innerText)
|
||
}
|
||
|
||
// // alternative method: emulate Ctrl+A, Ctrl+C (sometimes gets more than body.innerText)
|
||
// const innerText = await page.$eval('*', (el) => {
|
||
// const selection = window.getSelection();
|
||
// const range = document.createRange();
|
||
// range.selectNode(el);
|
||
// selection.removeAllRanges();
|
||
// selection.addRange(range);
|
||
// return window.getSelection().toString();
|
||
// });
|
||
|
||
return innerText
|
||
}
|
||
|
||
async function savePandoc(page, { original_url, version }) {
|
||
console.log(`[📒] Converting DOM HTML to markdown with Pandoc...`.padEnd(82), prettyPath(PANDOC_PATH(page)))
|
||
|
||
let dom_paths = [DOM_PATH(page), SINGLEFILE_PATH(page)].filter(fs.existsSync)
|
||
if (!dom_paths.length) return null  // an empty array is truthy, so check .length explicitly
|
||
const dom_path = dom_paths[0]
|
||
|
||
var stdout: string = ''
|
||
var stderr: string = ''
|
||
let result: any = null
|
||
const BIN_NAME = 'pandoc'
|
||
// pandoc --from html --to markdown_github --citeproc --wrap=none --highlight-style=kate
|
||
const args = [
|
||
BIN_NAME,
|
||
'--from=html',
|
||
'--to=markdown_github',
|
||
'--wrap=none',
|
||
'--citeproc',
|
||
'--highlight-style=kate',
|
||
`--output='${PANDOC_PATH(page)}'`,
|
||
dom_path,
|
||
]
|
||
try {
|
||
;({ stdout, stderr } = await exec(args.join(' ')));
|
||
stdout = stdout.toString().trim()
|
||
if (!stdout) throw 'Got empty result!'
|
||
result = {
|
||
TYPE: 'pandoc',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
cmd: args,
|
||
markdown_file: PANDOC_PATH(page),
|
||
}
|
||
} catch (parse_err) {
|
||
console.warn('[❌] Failed to run Pandoc HTML to MD conversion', parse_err, stderr)
|
||
}
|
||
if (!stdout) {return null}
|
||
await overwriteFile(
|
||
PANDOC_PATH(page),
|
||
stdout,
|
||
)
|
||
|
||
// pandoc --from markdown_github --to html --citeproc --wrap=none --highlight-style=kate
|
||
const reverse_conversion_args = [
|
||
BIN_NAME,
|
||
'--from=markdown_github',
|
||
'--to=html',
|
||
'--wrap=none',
|
||
'--citeproc',
|
||
'--highlight-style=kate',
|
||
`--output='${PANDOC_PATH(page).replace('.md', '.html')}'`,
|
||
PANDOC_PATH(page),
|
||
]
|
||
try {
|
||
; ({ stdout, stderr } = await exec(reverse_conversion_args.join(' ')));
|
||
stdout = stdout.toString().trim()
|
||
if (!stdout) throw 'Got empty result!'
|
||
result = {
|
||
...result,
|
||
html_file: PANDOC_PATH(page).replace('.md', '.html'),
|
||
}
|
||
} catch (parse_err) {
|
||
console.warn('[❌] Failed to run Pandoc MD to HTML conversion', parse_err, stderr)
|
||
}
|
||
if (!result) { return null }
|
||
await overwriteFile(
PANDOC_PATH(page).replace('.md', '.html'),
stdout,  // the converted HTML captured from stdout (not the result metadata object)
)
|
||
|
||
return result
|
||
}
|
||
|
||
async function saveReadability(page, {original_url, version}) {
|
||
const url = await page.url()
|
||
let html = ''
|
||
let article = null
|
||
try {
|
||
html = await page.content()
|
||
if (html.length > 14_000_000) {
|
||
console.warn('[⚠️] Truncating readability article text because html is too long...', html.length)
|
||
html = html.substring(0, 13_900_000)
|
||
}
|
||
const virtualConsole = new VirtualConsole()
|
||
const dom = new JSDOM(html, {url, virtualConsole})
|
||
const reader = new Readability(dom.window.document);
|
||
article = reader.parse()
|
||
} catch(err) {
|
||
console.warn(`[❌] Failed to get readability article text`)
|
||
return null
|
||
}
|
||
if (article) {
|
||
console.log(`[📜] Saving readability article text (${article.textContent?.length})...`.padEnd(82), prettyPath(READABILITY_PATH(page)))
|
||
const {content, textContent, ...metadata} = article
|
||
if (content.trim()) {
|
||
await overwriteFile(READABILITY_PATH(page).replace('.json', '.html'), content);
|
||
}
|
||
if (textContent.trim()) {
|
||
await overwriteFile(READABILITY_PATH(page).replace('.json', '.txt'), textContent);
|
||
}
|
||
const readability_info = {
|
||
TYPE: 'readability',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
...metadata,
|
||
}
|
||
await overwriteFile(READABILITY_PATH(page), readability_info)
|
||
return readability_info
|
||
}
|
||
return null
|
||
}
|
||
|
||
async function saveAccessibility(page, {original_url, version}) {
|
||
// get accessibility tree
|
||
const accessibility_tree = await page.accessibility.snapshot({interestingOnly: true});
|
||
// console.log(accessibility_tree);
|
||
|
||
// get iframe tree
|
||
const iframes = []
|
||
function dumpFrameTree(frame, indent='>') {
|
||
iframes.push(indent + frame.url());
|
||
for (const child of frame.childFrames()) {
|
||
dumpFrameTree(child, indent + '>');
|
||
}
|
||
}
|
||
dumpFrameTree(page.mainFrame(), '');
|
||
// console.log(iframes)
|
||
|
||
// generate simple table-of-contents of all the key html elements (e.g. h1, h2, h3, article, main, etc.)
|
||
const outline = await page.evaluate(() => {
|
||
const headings = []
|
||
for (const elem of [...document.querySelectorAll("h1, h2, h3, h4, h5, h6, a, header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe")] as HTMLElement[]) {
|
||
|
||
// skip a tags that aren't named anchors
|
||
if (elem.tagName.toLowerCase() == 'a' && !(elem as HTMLAnchorElement).name) continue
|
||
|
||
// e.g. article #main-article
|
||
const elem_id = ((typeof elem.id === 'string' && elem.id) || (elem as HTMLAnchorElement).name || elem.ariaLabel || elem.role || '')
|
||
const elem_classes = elem.className.trim().split(' ').slice(0, 3).join(' .') || ''
|
||
const elem_action = (elem as any).action?.split('/')?.slice(-1)?.join('/')
|
||
const summary = elem.innerText.length > 128
|
||
? `${elem.innerText?.slice(0, 128)}...`
|
||
: elem.innerText
|
||
|
||
let prefix = ''
|
||
let title = (elem_id ? `#${elem_id}` : '')
|
||
if (!title && elem_classes) title = `.${elem_classes}`
|
||
if (elem_action) title = `${title} /${elem_action}`
|
||
if (summary) title = `${title}: ${summary}`
|
||
|
||
// if elem is a header, prepend a #### prefix based on its level
|
||
const level = Number(elem.tagName.toLowerCase().replace('h', ''))
|
||
if (!isNaN(level)) {
|
||
prefix = '#'.repeat(level)
|
||
title = elem.innerText || elem_id || elem_classes
|
||
} else {
|
||
// set prefix to element's breadcrumb path
|
||
let node = elem
|
||
const parents = [elem.tagName?.toLowerCase().trim()]
|
||
while (node) {
|
||
// add each parent element's name to the path
|
||
// const elem_type = node.tagName?.toLowerCase().trim() || ''
|
||
// if (elem_type && !['div', 'span', 'p', 'body', 'html'].includes(elem_type)) {
|
||
// parents.unshift(elem_type);
|
||
// }
|
||
parents.unshift('') // add empty string to abbreviate path as >>>> instead of main>article>header>div>...
|
||
node = node.parentNode as HTMLElement
|
||
}
|
||
prefix = parents.join('>')
|
||
}
|
||
// strip all repeated whitespace and newlines
|
||
title = title.replaceAll('\n', ' ').replace(/\s+/g, ' ').trim()
|
||
|
||
if (prefix) {
|
||
headings.push(`${prefix} ${title}`)
|
||
}
|
||
}
|
||
// console.log(headings.join('\n'))
|
||
return headings
|
||
})
|
||
|
||
console.log(`[🩼] Saving accessibility outline (${Object.keys(accessibility_tree).length})...`.padEnd(82), prettyPath(ACCESIBILITY_PATH(page)))
|
||
// console.log(outline.filter(line => line.startsWith('#')).join('\n'))
|
||
|
||
const accessibility_info = {
|
||
TYPE: 'accessibility',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
iframes,
|
||
headings: outline,
|
||
tree: accessibility_tree,
|
||
}
|
||
|
||
await overwriteFile(
|
||
ACCESIBILITY_PATH(page),
|
||
accessibility_info,
|
||
)
|
||
|
||
return accessibility_info
|
||
}
|
||
|
||
async function saveSEO(page, {original_url, version}) {
|
||
// collect all <meta name="title" property="og:title" content="Page Title for SEO | Somesite.com"> tags into dict
|
||
const seo_vars = await page.evaluate(() =>
|
||
[...document.querySelectorAll('meta')]
|
||
.map(tag => ({key: tag.getAttribute('name') || tag.getAttribute('property') || '', value: tag.getAttribute('content') || ''}))
|
||
.filter(obj => obj.key && obj.value)
|
||
.sort((a, b) => a.value.length - b.value.length)
|
||
.reduce((acc, node) => {acc[node.key] = node.value; return acc}, {})
|
||
)
|
||
|
||
const seo_info = {
|
||
TYPE: 'seo',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
...seo_vars,
|
||
}
|
||
|
||
const num_vars = Object.keys(seo_vars).length
|
||
if (num_vars) {
|
||
console.log(`[🔎] Saving page SEO metadata (${num_vars})...`.padEnd(82), prettyPath(SEO_PATH(page)))
|
||
await overwriteFile(SEO_PATH(page), seo_info)
|
||
}
|
||
|
||
return seo_info
|
||
}
|
||
|
||
async function saveOutlinks(page, {original_url, version}) {
|
||
// TODO: slow to iterate over all elements so many times, perhaps we can collapse everything down into one loop
|
||
|
||
|
||
// Regular expression that matches syntax for a link (https://stackoverflow.com/a/3809435/117030):
|
||
const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
|
||
|
||
const filterW3Urls = (urls) =>
|
||
urls.filter(url =>
|
||
url && !url.startsWith('http://www.w3.org/'))
|
||
|
||
const filterDataUrls = (urls) =>
|
||
urls.filter(url =>
|
||
url && !url.startsWith('data:'))
|
||
|
||
const html = await page.content();
|
||
|
||
const raw = html?.match(LINK_REGEX) || [];
|
||
|
||
const hrefs = await page.$$eval(
|
||
"pierce/a[href]",
|
||
elems => elems
|
||
.map(elem => elem.href)
|
||
.filter(url => url),
|
||
);
|
||
|
||
const links = await page.$$eval(
|
||
"pierce/link[href]",
|
||
elems => elems
|
||
.map(({rel, href}) => ({rel, href}))
|
||
.filter(({rel, href}) => rel !== 'stylesheet')
|
||
.reduce((collection, entry) => {
|
||
const {rel, href} = entry
|
||
const non_empty_rel = collection[href]?.rel || rel
|
||
collection[href] = {rel: non_empty_rel, href}
|
||
return collection
|
||
}, {})
|
||
);
|
||
|
||
const iframes = await page.$$eval(
|
||
"pierce/iframe[src]",
|
||
elems => elems.map(iframe => iframe.src).filter(url => url)
|
||
);
|
||
|
||
const images = await page.$$eval(
|
||
"pierce/img[src]",
|
||
elems => elems.map(img => img.src).filter(url => url && !url.startsWith('data:'))
|
||
);
|
||
|
||
|
||
const css_images = await page.$$eval(
|
||
"pierce/*",
|
||
elems => elems
|
||
.map(elem => {
|
||
const css_url_ptn = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i;
|
||
const bg_img = window.getComputedStyle(elem, null).getPropertyValue('background-image')
|
||
const bg_url = css_url_ptn.exec(bg_img)
|
||
return bg_url ? bg_url[1] : null
|
||
})
|
||
)
|
||
|
||
const css_stylesheets = await page.$$eval(
|
||
"pierce/link[rel=stylesheet]",
|
||
elems => elems.map(elem => elem.href).filter(url => url)
|
||
);
|
||
|
||
const js_scripts = await page.$$eval(
|
||
"pierce/script[src]",
|
||
elems => elems.map(elem => elem.src).filter(url => url)
|
||
);
|
||
|
||
const outlinks_info = {
|
||
TYPE: 'outlinks',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
|
||
hrefs: [...new Set(filterDataUrls(hrefs))],
|
||
links: [...Object.values(links)],
|
||
iframes: [...new Set(iframes)],
|
||
images: [...new Set(filterDataUrls(images))],
|
||
css_images: [...new Set(filterDataUrls(css_images))],
|
||
css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
|
||
js_scripts: [...new Set(filterDataUrls(js_scripts))],
|
||
}
|
||
|
||
if (raw?.length || hrefs?.length || links?.length || iframes?.length) {
|
||
console.log(`[🖇️] Saving page outgoing links (${raw?.length || hrefs?.length})...`.padEnd(82+1), prettyPath(OUTLINKS_PATH(page)))
|
||
|
||
await overwriteFile(OUTLINKS_PATH(page), outlinks_info)
|
||
}
|
||
return outlinks_info
|
||
}
|
||
|
||
|
||
async function saveAuthStorage(page, {client, version, original_url}) {
|
||
const url = original_url || await page.url()
|
||
if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
|
||
if (!SAVE_AUTH_STORAGE) return null
|
||
|
||
// const cookies = JSON.stringify(await page.cookies()); // doesnt include httponly cookies
|
||
const auth_from_browser = {
|
||
cookies: (await client.send('Network.getAllCookies')).cookies,
|
||
localStorage: {},
|
||
sessionStorage: {},
|
||
}
|
||
|
||
// attempt to load localStorage and sessionStorage from browser (may fail in some cases https://github.com/puppeteer/puppeteer/issues/921)
|
||
try {
|
||
auth_from_browser.localStorage = (await page.evaluate(() =>
|
||
JSON.parse(JSON.stringify({[window.location.origin]: window.localStorage}))))
|
||
} catch(err) {
|
||
throw `Failed to get page window.localStorage! ${err}`
|
||
}
|
||
try {
|
||
auth_from_browser.sessionStorage = (await page.evaluate(() =>
|
||
JSON.parse(JSON.stringify({[window.location.origin]: window.sessionStorage}))))
|
||
} catch(err) {
|
||
throw `Failed to get page window.sessionStorage! ${err}`
|
||
}
|
||
|
||
// WARNING: small TOCTTOU gap between this read-before-write and the write below
|
||
// can possibly overwrite changes made by other processes in this gap
|
||
const auth_on_disk = await loadAuthStorage(page, {client}, {apply: false})
|
||
|
||
const cookies = dedupeCookies([...auth_on_disk.cookies, ...auth_from_browser.cookies])
|
||
|
||
const auth_info = {
|
||
TYPE: 'auth',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
cookies: cookies,
|
||
sessionStorage: merge(auth_on_disk.sessionStorage, auth_from_browser.sessionStorage),
|
||
localStorage: merge(auth_on_disk.localStorage, auth_from_browser.localStorage),
|
||
}
|
||
// console.log(`[⛙] Merged ${auth_on_disk.cookies.length} existing + ${auth_from_browser.cookies.length} new -> ${auth_info.cookies.length} cookies`)
|
||
|
||
console.log(`[🍪] Saving cookies/localStorage/sessionStorage (${auth_info.cookies.length})...`.padEnd(82), prettyPath(AUTH_JSON_PATH));
|
||
await overwriteFile(AUTH_JSON_PATH, auth_info);
|
||
|
||
// Write to cookies.txt file using tough-cookie + @root/file-cookie-store
|
||
await saveCookiesTxt(cookies)
|
||
|
||
return auth_info
|
||
}
|
||
|
||
async function saveCookiesTxt(cookies) {
|
||
const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false})
|
||
const cookie_jar = new ToughCookie.CookieJar(cookies_store)
|
||
cookie_jar.setCookieAsync = util.promisify(cookie_jar.setCookie)
|
||
cookies_store.saveAsync = util.promisify(cookies_store.save)
|
||
for (const cookie of cookies) {
|
||
const cookie_for_tough = {
|
||
domain: cookie.domain,
|
||
path: cookie.path,
|
||
key: cookie.name,
|
||
value: cookie.value,
|
||
expires: (new Date(cookie.expires * 1000)).toISOString(),
|
||
hostOnly: !cookie.domain.startsWith('.'),  // a leading '.' means a domain-wide cookie, so it is NOT host-only
|
||
secure: cookie.secure,
|
||
}
|
||
// console.log('COOKIE_FOR_TOUGH_TXT', cookie_for_tough)
|
||
const parsed_cookie = ToughCookie.Cookie.fromJSON(cookie_for_tough)
|
||
// console.log('COOKIE_FOR_TOUGH_TXT_TO_DUMP', parsed_cookie)
|
||
try {
|
||
// assemble a fake URL just to satisfy ToughCookieJar's requirement of having a URL at set time
|
||
let url = cookie.secure ? 'https://' : 'http://'
|
||
if (cookie.domain.startsWith('.')) {
|
||
url = url + cookie.domain.slice(1)
|
||
} else {
|
||
url = url + cookie.domain
|
||
}
|
||
if (cookie.sourcePort && ![80, 443].includes(cookie.sourcePort)) {
|
||
url = `${url}:${cookie.sourcePort}`
|
||
}
|
||
url = `${url}${cookie.path || ''}`
|
||
await cookie_jar.setCookieAsync(parsed_cookie, url, {ignoreError: true})
|
||
} catch(err) {
|
||
console.error('[❌] Failed to dump browser cookie for cookies.txt...', cookie_for_tough, '->', parsed_cookie, err)
|
||
}
|
||
}
|
||
console.log(`[🍪] Saving cookies TXT (${cookies.length})...`.padEnd(82), prettyPath(COOKIES_TXT_PATH));
|
||
await cookies_store.saveAsync()
|
||
}
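// example usage (sketch): converting a single illustrative CDP-style cookie into cookies.txt format
// (the cookie values below are made up; real input comes from Network.getAllCookies above)
// await saveCookiesTxt([{
//     name: 'session_id', value: 'abc123', domain: '.example.com', path: '/',
//     expires: 1767225600, secure: true, sourcePort: 443,
// }])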
|
||
|
||
async function saveMetrics(page, {original_url, version, start_time, start_ts, traffic_log, redirects}) {
|
||
const end_time = (new Date()).toISOString()
|
||
const end_ts = Date.now()
|
||
const metrics_info = {
|
||
TYPE: 'metrics',
|
||
VERSION: version,
|
||
URL: original_url,
|
||
...(await page.metrics()),
|
||
start_time,
|
||
start_ts,
|
||
end_time,
|
||
end_ts,
|
||
duration: (end_ts - start_ts),
|
||
num_requests: traffic_log.length,
|
||
num_redirects: Object.keys(redirects).length -1,
|
||
}
|
||
|
||
console.log(`[🏎️] Saving final summary + timing metrics...`.padEnd(82+1), prettyPath(METRICS_PATH(page)))
|
||
await overwriteFile(METRICS_PATH(page), metrics_info)
|
||
|
||
return metrics_info
|
||
}
|
||
|
||
|
||
/******************************************************************************/
|
||
/******************************************************************************/
|
||
|
||
/**************************** Utility Helpers *********************************/
|
||
|
||
|
||
function hashCode(str) {
|
||
// get a simple integer hash for a given string (based on java String#hashCode)
|
||
// useful only for throwaway nonces / easy deterministic random identifiers, not a replacement for sha256
|
||
let hash = 0;
|
||
for (let i=0; i<str.length; i++) {
|
||
hash = str.charCodeAt(i) + ((hash << 5) - hash);
|
||
}
|
||
return Math.abs(hash)
|
||
}
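// example usage (sketch): the same input always yields the same positive integer, handy for throwaway nonces
// hashCode('https://example.com/some/page')                                                 // => stable positive int
// hashCode('https://example.com/some/page') === hashCode('https://example.com/some/page')   // => true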
|
||
|
||
function unique(iter, key: string | ((any, number) => string)='id') {
|
||
// uniqueify an array of objects by a value within them, key can be name of attr or getter function
|
||
// > iter = [{id: 1}, {id: 2}, {id: 1}]
|
||
// > Object.entries(iter) = [
|
||
// [ '0', { id: 1 } ],
|
||
// [ '1', { id: 2 } ],
|
||
// [ '2', { id: 1 } ] ]
|
||
// > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}}
|
||
|
||
// > iter = {a1: {id: 1}, b2: {id: 2}, a3: {id: 1}}
|
||
// > Object.entries(iter) = [
|
||
// [ 'a1', { id: 1 } ],
|
||
// [ 'b2', { id: 2 } ],
|
||
// [ 'a3', { id: 1 } ]
|
||
// ]
|
||
// > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}}
|
||
|
||
const key_type = (typeof key)
|
||
if (!['function', 'string'].includes(key_type))
|
||
throw 'key must be either a string lookup key or a function (obj, idx) => return unique_id'
|
||
|
||
const key_func = (key_type === 'string')
|
||
? (entry_obj, idx) => entry_obj[(key as string)]
|
||
: (entry_obj, idx) => (key as Function)(entry_obj, idx) // otherwise key is a callback func
|
||
|
||
const seen = {}
|
||
for (const [idx, entry_obj] of Object.entries(iter)) {
|
||
const unique_id = key_func(entry_obj, idx)
|
||
if (seen[unique_id] === undefined) {
|
||
seen[unique_id] = entry_obj
|
||
}
|
||
}
|
||
|
||
return seen
|
||
}
|
||
|
||
const wait = (ms: number) => new Promise(res => {
|
||
if (ms > 10_000) {
|
||
console.debug(`[⏲️] Waiting ${Math.round(ms/1000)}s...`)
|
||
}
|
||
setTimeout(res, ms)
|
||
})
|
||
|
||
const TimeoutError = Symbol()
|
||
const withTimeout = (promise, ms) => {
|
||
// run a promise with a time limit, raises a TimeoutError if it fails
|
||
let timer
|
||
return Promise.race([
|
||
promise,
|
||
new Promise((_r, reject) =>
|
||
timer = setTimeout(reject, ms, TimeoutError)
|
||
),
|
||
]).finally(() => clearTimeout(timer))
|
||
}
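// example usage (sketch, assuming a puppeteer `page` is in scope): give a slow wait a 5s budget
// try {
//     await withTimeout(page.waitForSelector('article'), 5_000)
// } catch(err) {
//     if (err === TimeoutError) console.warn('[⏲️] Gave up waiting for selector after 5s')
//     else throw err
// }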
|
||
|
||
const MAX_VALID_DATE = new Date('2150-01-01T00:00:00.000Z')
|
||
const MIN_VALID_DATE = new Date('2010-01-01T00:00:00.000Z')
|
||
const UNIX_EPOCH_DATE = new Date(0)
|
||
|
||
const validateDate = (date, {min=MIN_VALID_DATE, max=MAX_VALID_DATE, singleton=UNIX_EPOCH_DATE}={}) => {
|
||
assert((date instanceof Date), `Got invalid type for Date: ${typeof date} ${date} (expected Date)`)
|
||
assert(String(date) !== 'Invalid Date', `Got invalid value for Date: ${typeof date} ${date}`)
|
||
if (Number(date) === Number(singleton)) return date // epoch singleton is always valid
|
||
assert(date < max, `Got Date that was higher than MAX_VALID_DATE=${max}`)
|
||
assert(date > min, `Got Date that was lower than MIN_VALID_DATE=${min}`)
|
||
return date
|
||
}
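// example usage (sketch):
// validateDate(new Date('2020-06-15T12:00:00.000Z'))  // => returns the Date unchanged (within the 2010-2150 range)
// validateDate(UNIX_EPOCH_DATE)                       // => allowed, the epoch singleton always passes
// validateDate(new Date('2222-01-01'))                // => throws (later than MAX_VALID_DATE)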
|
||
|
||
const parseVersionDateStr = (yyyymmddtime) => {
|
||
// YYYYMMDDhhmmssxxx or YYYYMMDDhhmmss or YYYYMMDDhhmm or YYYYMMDD -> Date
|
||
const is_only_numbers = /^\d+$/.test(yyyymmddtime.replace('.', ''))
|
||
assert(is_only_numbers, `Non-numeric characters in YYYYMMDD date are not allowed: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`)
|
||
|
||
const num_digits = String(yyyymmddtime).split('.')[0].length
|
||
assert([17, 14, 12, 8].includes(num_digits), `Got invalid number of digits (${num_digits}) in YYYYMMDD date: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`)
|
||
|
||
const [_all, yyyy, mm, dd, hr, min, sec, ms] = /^(\d{4})(\d{2})(\d{2})(\d{2})?(\d{2})?(\d{2})?(\d{3})?$/.exec(yyyymmddtime)
|
||
assert(yyyy && mm && dd, `Could not find YYYYMMDD`)
|
||
const time_error_msg = `Detected YYYYMMDD[hhmm[ss[xxx]]] but time segment is invalid ${hr || '__'}:${min || '__'}:${sec || '__'}.${ms || '___'}`
|
||
if (ms) assert(hr && min && sec, time_error_msg)
|
||
if (sec) assert(hr && min, time_error_msg)
|
||
if (min) assert(hr, time_error_msg)
|
||
if (hr) assert (min, time_error_msg)
|
||
|
||
const iso_str = `${yyyy}-${mm}-${dd}T${hr || '00'}:${min || '00'}:${sec || '00'}.${ms || '00'}Z`
|
||
const parsed_date = new Date(iso_str)
|
||
|
||
return validateDate(parsed_date) // 1970-01-01T00:00:00.000Z (ISO format)
|
||
}
|
||
|
||
const parseTimestampDateStr = (timestamp) => {
|
||
// 1709724291000 or 1709724291000.000 or 1709724291 or 1709724291.000 -> Date
|
||
timestamp = String(timestamp)
|
||
const is_only_numbers = /^\d+$/.test(timestamp.replace('.', ''))
|
||
assert(is_only_numbers, `Got invalid characters in timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`)
|
||
|
||
const num_digits = String(timestamp).split('.')[0].length
|
||
assert([13, 10, 1].includes(num_digits), `Got invalid number of digits (${num_digits}) in timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`)
|
||
|
||
let parsed_date = null
|
||
|
||
if (num_digits === 13) {
|
||
parsed_date = new Date(Number(timestamp)) // 1709724291000 (unix timestamp w/ milliseconds)
|
||
} else if (num_digits === 10) {
|
||
parsed_date = new Date(Number(timestamp) * 1000) // 1709724291 (unix timestamp w/ seconds)
|
||
} else if (num_digits === 1) {
|
||
assert(String(timestamp) === '0', `Got invalid single-digit timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format or 0 for UNIX epoch)`)
|
||
parsed_date = UNIX_EPOCH_DATE
|
||
}
|
||
return validateDate(parsed_date)
|
||
}
|
||
|
||
const parseISODateStr = (iso_str) => {
|
||
// 1970-01-01T00:00:00.000Z -> Date
|
||
const num_digits = String(iso_str).length
|
||
assert([24, 19, 16, 10].includes(num_digits), `Got invalid number of digits (${num_digits}) in ISO date: ${iso_str} (while trying 1970-01-01T00:00:00.000Z format)`)
|
||
|
||
const parsed_date = new Date(iso_str)
|
||
return validateDate(parsed_date)
|
||
}
|
||
|
||
const parseDate = (date) => {
|
||
// date === undefined => use today/now
|
||
// date === null => use unix epoch 0 aka 1970-01-01T00:00:00.000Z
|
||
// date *= YYYYMMDDHHMMSS => use a version date string (e.g. 20010131235958)
|
||
// date *= 1234567... => use a timestamp (e.g. 1709724291000)
|
||
// date *= 1970-01-01T... => use iso datetime (e.g. 1970-01-01T00:00:00.000Z)
|
||
// returns -> Date
|
||
|
||
if (date === undefined) {
|
||
return (new Date()) // today (2024-05-29T22:02:34.682Z) aka timestamp=1717020154682
|
||
}
|
||
if (date === null || date == 0) {
|
||
return UNIX_EPOCH_DATE // unix epoch (1970-01-01T00:00:00.000Z) aka timestamp=0
|
||
}
|
||
if (date instanceof Date) {
|
||
return validateDate(date) // JS date Date('1970-01-01T00:00:00.000Z')
|
||
}
|
||
|
||
if ((typeof date) === 'number') {
|
||
date = String(date) // unix timestamp e.g. 1717020154682
|
||
}
|
||
assert((typeof date) === 'string', `Tried to parse date but got unsupported type ${(typeof date)}: ${date}`)
|
||
|
||
const errors = [`Failed to parse Date from string: ${date}`]
|
||
try {
|
||
return parseVersionDateStr(date)
|
||
} catch(err) { errors.push(err) }
|
||
try {
|
||
return parseTimestampDateStr(date)
|
||
} catch(err) { errors.push(err) }
|
||
try {
|
||
return parseISODateStr(date)
|
||
} catch(err) { errors.push(err) }
|
||
|
||
throw errors.join('\n')
|
||
}
|
||
|
||
const versionStrFromDate = (date, {withDate=true, withTime=true, withSeconds=true, withMilliseconds=false}={}) => {
|
||
// takes Date, returns YYYYMMDDHHMMSSXXX or YYYYMMDDHHMMSS or YYYYMMDDHHMM or YYYYMMDD
|
||
const parsed_date = parseDate(date)
|
||
|
||
const [date_iso, time_iso] = parsed_date.toISOString().split('T') // ['2001-01-31', '23:59:58.090Z']
|
||
|
||
const components_to_use = []
|
||
if (withDate) {
|
||
components_to_use.push(date_iso.replaceAll('-', '')) // '20010131'
|
||
}
|
||
if (withTime) {
|
||
const [hr, min, sec, ms] = time_iso.replace('Z', '').replace('.', ':').split(':') // ['23', '59', '58', '090']
|
||
components_to_use.push(hr)
|
||
components_to_use.push(min)
|
||
if (withSeconds) {
|
||
components_to_use.push(sec)
|
||
if (withMilliseconds) {
|
||
components_to_use.push(ms)
|
||
}
|
||
}
|
||
}
|
||
assert(components_to_use.length, 'At least one of {withDate, withTime} must be set.')
|
||
|
||
const final_str = components_to_use.join('') // 20010131235958
|
||
|
||
assert(parseVersionDateStr(final_str)) // sanity check to make sure it parses correctly
|
||
|
||
return final_str
|
||
}
|
||
|
||
// test date functions:
|
||
// console.log(parseDate('20120131'))
|
||
// console.log(versionStrFromDate(parseDate('20120131')))
|
||
// console.log(versionStrFromDate(parseDate('0')))
|
||
// console.log(versionStrFromDate(parseDate(0)))
|
||
// console.log(versionStrFromDate(parseDate(null)))
|
||
// console.log(versionStrFromDate())
|
||
// console.log(versionStrFromDate(parseDate('20120131235859090')))
|
||
// console.log(versionStrFromDate(parseDate('1970-01-01T00:00:00.000Z')))
|
||
// console.log(versionStrFromDate(parseDate('2024-12-01T00:00')))
|
||
// console.log(versionStrFromDate(parseDate('2024-12-01'), {withTime: false}))
|
||
|
||
const prettyPath = (path) => {
|
||
// return a pretty-printable path where the abspath of the data dir is replaced with /data for brevity/privacy
|
||
return path.replace(DATA_DIR, './data')
|
||
}
|
||
|
||
const pathIsHidden = (relpath) => {
|
||
// check if a path or any of the directories above it are hidden (e.g. ./some/.dir/abc or ./.DS_Store)
|
||
|
||
// make sure test path behaves like an abspath (avoids edge-cases messing up relpaths on '' or '.' or './')
|
||
let test_path = relpath
|
||
if (test_path.startsWith('./'))
|
||
test_path = test_path.substring(2)
|
||
if (!test_path.startsWith('/'))
|
||
test_path = path.join('/', test_path)
|
||
|
||
// iterate through parents, checking if any parent is hidden until we reach /
|
||
while (test_path !== '/') {
|
||
const basename = path.basename(test_path)
|
||
if (basename.startsWith('.')) {
|
||
// console.log('PATH IS HIDDEN', relpath)
|
||
return true
|
||
}
|
||
// otherwise set test_path to parent dir and repeat
|
||
test_path = path.dirname(test_path)
|
||
}
|
||
return false
|
||
}
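// example usage (sketch):
// pathIsHidden('./archive/1709724410.19269/index.json')  // => false
// pathIsHidden('./archive/.DS_Store')                     // => true
// pathIsHidden('some/.hidden-dir/file.txt')               // => true (a hidden parent dir counts too)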
|
||
|
||
const pathDepth = (child_path, relative_to='.') => {
|
||
// get the number of directory hops deep a child path is relative to '.' (or a given parent)
|
||
|
||
if (child_path.startsWith('/') && !relative_to.startsWith('/')) {
|
||
// if child_path is absolute, then relative_to must be absolute as well otherwise depth will be depth all the way to the / root
|
||
relative_to = fs.realpathSync(relative_to)
|
||
}
|
||
if (relative_to.startsWith('/') && !child_path.startsWith('/')) {
|
||
// same deal, either both paths have to be relative, or both have to be absolute
|
||
child_path = fs.realpathSync(child_path)
|
||
}
|
||
const relative_path_to_root = path.relative(relative_to, child_path)
|
||
const num_hops_down = relative_path_to_root.split('/').length
|
||
return num_hops_down
|
||
}
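// example usage (sketch, with both paths relative so no realpath lookup is needed):
// pathDepth('archive')                               // => 1
// pathDepth('archive/1709724410.19269')              // => 2
// pathDepth('archive/1709724410.19269', 'archive')   // => 1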
|
||
|
||
interface DirentWithExtras extends fs.Dirent {
|
||
relpath: string,
|
||
abspath: string,
|
||
reldepth: number,
|
||
}
|
||
|
||
async function getDirEntries(dir_path, {pwd=null, recursive=true, includeHidden=false, includeFiles=true, includeDirs=true, includeLinks=false, filter=null, maxdepth=-1}={}) {
|
||
// get the list of all sub-paths under a given path recursively
|
||
|
||
// console.log('GETTING DIRECTORY ENTRIES', {dir_path, pwd, recursive, includeHidden, includeFiles, includeDirs, maxdepth})
|
||
|
||
pwd = pwd || dir_path
|
||
let dir_abspath = dir_path
|
||
|
||
if (!dir_abspath.startsWith(pwd)) {
|
||
dir_abspath = path.join(pwd, dir_abspath)
|
||
}
|
||
|
||
assert(fs.existsSync(dir_abspath), `Tried to get directory listing for dir that doesn't exist! ${prettyPath(dir_abspath)}`)
|
||
|
||
return (await fs.promises.readdir(dir_abspath, { recursive, withFileTypes: true }))
|
||
.map((dirent: DirentWithExtras) => {
|
||
// filter combined with map because relpath is re-used in both operations
|
||
const relpath = path.join(path.relative(pwd, dirent.parentPath), dirent.name)
|
||
// console.log('CALCULATED RELATIVE PATH', relpath)
|
||
const abspath = path.join(dir_abspath, relpath)
|
||
const basename = path.basename(dirent.name)
|
||
if (!includeLinks && dirent.isSymbolicLink()) return null
|
||
if (!includeFiles && dirent.isFile()) return null
|
||
if (!includeDirs && dirent.isDirectory()) return null
|
||
if (!includeHidden && pathIsHidden(relpath)) return null
|
||
|
||
dirent.relpath = relpath
|
||
dirent.abspath = abspath
|
||
dirent.reldepth = pathDepth(relpath)
|
||
// console.log('RELATIVE DEPTH MEASURED', prettyPath(dir_abspath), prettyPath(relpath), dirent.reldepth)
|
||
|
||
if (maxdepth >= 0) {
|
||
if ((dirent.reldepth-1) > maxdepth) return null
|
||
}
|
||
|
||
if ((typeof filter) === 'function') {
|
||
const should_keep = filter({abspath, relpath, basename, dirent})
|
||
if (!should_keep) {
|
||
// console.log('FILTER EXCLUDED RESULT', {abspath, relpath, basename, dirent})
|
||
return null
|
||
}
|
||
}
|
||
|
||
return relpath
|
||
})
|
||
.filter(Boolean)
|
||
.sort() as string[]
|
||
}
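// example usage (sketch): list all JSON files up to 3 levels deep under DATA_DIR, as sorted relative paths
// const json_files = await getDirEntries(DATA_DIR, {
//     recursive: true,
//     includeDirs: false,
//     maxdepth: 3,
//     filter: ({basename}) => basename.endsWith('.json'),
// })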
|
||
|
||
|
||
async function getTotalSize(dir_or_file_path, {pwd=null, _cache=null, filter=null, subfiles=null}={}) {
|
||
// get the total size in bytes of a file or directory (recursively adds up file sizes within directory)
|
||
|
||
// check _cache first
|
||
if (_cache && (dir_or_file_path in _cache))
|
||
return _cache[dir_or_file_path]
|
||
|
||
// make sure dir_or_file_path is under pwd
|
||
pwd = pwd || path.dirname(dir_or_file_path)
|
||
let abspath = dir_or_file_path
|
||
if (!dir_or_file_path.startsWith(pwd)) {
|
||
abspath = path.join(pwd, dir_or_file_path)
|
||
}
|
||
|
||
// if it's a file, stat it and return the size
|
||
// console.log('CALCULATED ABSPATH', {abspath, dir_or_file_path, pwd})
|
||
const dirent = await fs.promises.stat(abspath)
|
||
if (dirent.isFile()) {
|
||
// console.log('CALCULATING FILE SIZE subfile=', prettyPath(abspath))
|
||
return dirent.size
|
||
}
|
||
|
||
// if it's not a file and not a directory, give up, dont try to size special files like FIFO/socket/etc.
|
||
if (!dirent.isDirectory()) return 0
|
||
|
||
// if it's a directory, size is the sum of all the sizes of files within
|
||
// console.log('CALCULATING SUBDIR SIZE subdir=', prettyPath(abspath))
|
||
let total_bytes = 0
|
||
const files_within = subfiles || await getDirEntries(dir_or_file_path, {
|
||
pwd,
|
||
recursive: true,
|
||
includeDirs: false,
|
||
includeFiles: true,
|
||
filter,
|
||
})
|
||
for (const subpath of files_within) {
|
||
total_bytes += await getTotalSize(subpath, {pwd, _cache, filter})
|
||
}
|
||
return total_bytes
|
||
}
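// example usage (sketch): works on a single file or recursively on a whole directory
// await getTotalSize(path.join(DATA_DIR, 'archive', '1709724410.19269', 'seo.json'))  // => size of one file in bytes
// await getTotalSize(path.join(DATA_DIR, 'archive'))                                  // => sum of all file sizes within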
|
||
|
||
|
||
async function getDirSizes(dir_path, {pwd=null, subfiles=null, withRoot=true, filter=null, maxdepth=-1}={}) {
|
||
// get the size of a directory and all the files within (recursively) as a number of bytes
|
||
// dir_path: path absolute or relative path of the directory you want size info for
|
||
// pwd: path (optional) absolute path of the directory you want to interpret dir_path relative to
|
||
// subfiles: dirent[] (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
|
||
// withRoot: bool include a summary entry for the root dir_path dir in the list as '.'
|
||
// filter: function (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
|
||
// maxdepth: number (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
|
||
|
||
assert((await fs.promises.stat(dir_path)).isDirectory(), `Tried to calculate directory sizes but path is not a directory! ${dir_path}`)
|
||
pwd = pwd || dir_path
|
||
|
||
// {'.': 246, 'example.json': 123, 'example2.txt': 123}
|
||
const sizes = {}
|
||
|
||
// first collect the list of all sub-files recursively and calculate their sizes individually
|
||
const files_within = subfiles || await getDirEntries(dir_path, {
|
||
pwd,
|
||
recursive: true,
|
||
includeDirs: false,
|
||
includeFiles: true,
|
||
// dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir sizes
|
||
// it never makes sense to ignore subfiles beyond a certain depth for size calculation
|
||
filter, // filter is allowed though, useful to calculate the size of some subset of files that match a pattern
|
||
})
|
||
for (const subpath of files_within) {
|
||
sizes[subpath] = await getTotalSize(subpath, {pwd, _cache: sizes, filter})
|
||
}
|
||
|
||
// then calculate the top-level directory total as the sum of all the file sizes under it
|
||
const total_size = Object.values(sizes).reduce((a: number, b: number) => a + b, 0)
|
||
|
||
// then calculate the subtotals of all the sub-directories
|
||
const subdirs_within = await getDirEntries(dir_path, {pwd, recursive: true, includeDirs: true, includeFiles: false, filter, maxdepth})
|
||
for (const subpath of subdirs_within) {
|
||
sizes[subpath] = await getTotalSize(subpath, {pwd, _cache: sizes, filter}) // uses _cache to avoid re-computing
|
||
}
|
||
|
||
// if maxdepth is passed, filter results to only include paths shallower than max depth
|
||
if (maxdepth >= 0) {
|
||
for (const subpath of Object.keys(sizes)) {
|
||
if (pathDepth(subpath) > maxdepth) {
|
||
delete sizes[subpath]
|
||
}
|
||
}
|
||
}
|
||
|
||
// set total_size last so it appears at the bottom of the object in logs for convenience
|
||
if (withRoot) {
|
||
sizes['.'] = total_size
|
||
}
|
||
|
||
return sizes
|
||
}
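// example usage (sketch, assuming a snapshot dir exists at this path): byte counts keyed by relative path
// const sizes = await getDirSizes(path.join(DATA_DIR, 'archive', '1709724410.19269'), {maxdepth: 1})
// // => {'screenshot.png': <bytes>, 'media': <bytes>, ..., '.': <recursive total for the whole dir>}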
|
||
|
||
|
||
async function getLargestPath(path_a, path_b) {
|
||
// compare two files/directories and return the largest one of the two (calculating size recursively)
|
||
|
||
path_a = await fs.promises.realpath(path_a)
|
||
path_b = await fs.promises.realpath(path_b)
|
||
const size_a = await getTotalSize(path_a)
|
||
const size_b = await getTotalSize(path_b)
|
||
|
||
// console.log('COMPARING', prettyPath(path_a), size_a, ' ', prettyPath(path_b), size_b)
|
||
|
||
if (size_a > size_b) return path_a
|
||
return path_b
|
||
}
|
||
|
||
async function findCommonAncestor(target_abspath, symlink_abspath, {relative=true, search_limit=DATA_DIR}: {relative?: boolean | string, search_limit?: string}={}) {
|
||
// given a target path and a symlink path, find the common ancestor path they both share
|
||
// (searches recursively through absolute path parent directories until a common dir is found, up to search_limit)
|
||
|
||
search_limit = await fs.promises.realpath(search_limit)
|
||
|
||
let relative_dir = search_limit
|
||
if ((typeof relative) === 'boolean') {
|
||
// if start dir is default, set it to symlinks directory path
|
||
if (relative) {
|
||
relative_dir = path.dirname(symlink_abspath)
|
||
} else {
|
||
relative_dir = search_limit
|
||
}
|
||
} else if ((typeof relative) === 'string') {
|
||
// if start dir is a string, get its absolute path
|
||
relative_dir = relative as string
|
||
} else {
|
||
throw `Got invalid type for relative path during common ancestor search: ${relative}`
|
||
}
|
||
|
||
if ((await fs.promises.stat(relative_dir)).isFile()) {
|
||
// if start dir is a file, set it to its parent dir path
|
||
relative_dir = path.dirname(relative_dir)
|
||
}
|
||
assert(
|
||
(await fs.promises.stat(relative_dir)).isDirectory(),
|
||
`Tried to find common ancestor starting from invalid search directory:\n 🔗 ${prettyPath(symlink_abspath)}\n -> ${prettyPath(target_abspath)}\n Error: search dir does not exist or is not a directory: ❌ ${prettyPath(relative_dir)}`,
|
||
)
|
||
|
||
const symlink_filename = path.basename(symlink_abspath)
|
||
const target_filename = path.basename(target_abspath)
|
||
const symlink_parent_abspath = await fs.promises.realpath(path.dirname(symlink_abspath))
|
||
const target_parent_abspath = await fs.promises.realpath(path.dirname(target_abspath))
|
||
const search_dir_abspath = await fs.promises.realpath(relative_dir)
|
||
|
||
let closest_common_ancestor = search_dir_abspath
|
||
|
||
const isAncestorCommon = (ancestor) => (
|
||
target_parent_abspath.startsWith(ancestor)
|
||
&& symlink_parent_abspath.startsWith(ancestor))
|
||
|
||
// check if both src and target start with the same ancestor path
|
||
while (closest_common_ancestor !== search_limit) {
|
||
if (isAncestorCommon(closest_common_ancestor)) break
|
||
else {
|
||
// otherwise go up one directory and try again
|
||
// console.log(' ...going up a directory', prettyPath(closest_common_ancestor)+'/..')
|
||
closest_common_ancestor = path.dirname(closest_common_ancestor)
|
||
}
|
||
}
|
||
|
||
assert(
|
||
isAncestorCommon(closest_common_ancestor),
|
||
`Tried to create relative symlink but could not find common ancestor:\n 🔗 ${prettyPath(symlink_abspath)}\n -> ${prettyPath(target_abspath)}\n Error: target path and symlink path are not both under:\n ❌ ${prettyPath(closest_common_ancestor)}`,
|
||
)
|
||
|
||
const symlink_to_ancestor_relpath = path.relative(symlink_parent_abspath, closest_common_ancestor) // ../../..
|
||
const target_from_ancestor_relpath = path.join(path.relative(closest_common_ancestor, target_parent_abspath), target_filename) // 'archive/19999999.23423523'
|
||
const symlink_to_target_relpath = path.join(symlink_to_ancestor_relpath, target_from_ancestor_relpath) // '../../../archive/19999999.23423523'
|
||
|
||
return {
|
||
closest_common_ancestor,
|
||
search_dir_abspath,
|
||
|
||
target_abspath,
|
||
target_filename,
|
||
target_from_ancestor_relpath,
|
||
|
||
symlink_abspath,
|
||
symlink_filename,
|
||
symlink_to_ancestor_relpath,
|
||
symlink_to_target_relpath,
|
||
}
|
||
}
|
||
|
||
interface StatsWithExtras extends fs.Stats {
|
||
abspath: string
|
||
relpath?: string
|
||
reldepth?: number
|
||
}
|
||
|
||
async function blockUntilExists(file_path, {timeout=7_500, min_bytes=0}={}) {
|
||
// wait up to timeout seconds until file we expect to exist appears on the filesystem
|
||
// (used to handle eventual consistency in network filesystems where we need a delay after writing before reads show up)
|
||
const interval = 250
|
||
const max_tries = timeout / interval
|
||
let tries = 0
|
||
|
||
let abspath = null
|
||
while (tries < max_tries) {
|
||
try {
|
||
const abspath = await fs.promises.realpath(file_path)
|
||
assert(fs.existsSync(abspath))
|
||
|
||
const dirent = await fs.promises.stat(abspath) as StatsWithExtras
|
||
dirent.abspath = abspath
|
||
|
||
if (min_bytes && (dirent.size < min_bytes)) {
|
||
assert(dirent.size >= 1)
|
||
// this is a valid warning but unfortunately its too common to bother showing:
|
||
// console.warn(`[⚠️] Expected file to be >=${Math.round(min_bytes/1000)}kb but was only ${dirent.size/1000}kb:`, prettyPath(file_path))
|
||
}
|
||
|
||
return dirent
|
||
} catch(err) {
|
||
const waited = (tries * interval)
|
||
if (waited === 5_000) {
|
||
console.warn(`[⚠️] Waited >${waited/1000}s for file to appear (is filesystem or bg task running slow?):`, prettyPath(file_path))
|
||
}
|
||
await wait(interval)
|
||
tries++
|
||
}
|
||
}
|
||
throw `Expected file does not exist after ${timeout/1000}s: ${prettyPath(file_path)}`
|
||
}
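// example usage (sketch; SCREENSHOT_PATH(page) is hypothetical here, any expected output path works):
// const dirent = await blockUntilExists(SCREENSHOT_PATH(page), {timeout: 10_000, min_bytes: 1_000})
// console.log('file appeared at', dirent.abspath, `(${dirent.size} bytes)`)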
|
||
|
||
async function overwriteSymlink(target_path, symlink_path, {relative=true, mkdirs=false, search_limit=DATA_DIR, timeout=5_000}: {relative?: boolean | string, mkdirs?: boolean, search_limit?: string, timeout?: number}={}) {
|
||
// create a symlink from symlink_path -> target_path
|
||
// relative: true => symlink is created as a relative link by default (it will auto-find the closest common ancestor dir, often DATA_DIR)
|
||
// mkdirs: true => optionally creates symlink parent dirs automatically)
|
||
|
||
// make sure target file actually exists first
|
||
let target_dirent
|
||
try {
|
||
target_dirent = await blockUntilExists(target_path, {timeout})
|
||
} catch(err) {
|
||
throw `Tried to create symlink pointing to file that does not exist:\n 🔗 ${prettyPath(symlink_path)}\n -> ❌ ${prettyPath(target_path)}\n ${err}`
|
||
}
|
||
const target_abspath = target_dirent.abspath
|
||
const target_filename = path.basename(target_abspath)
|
||
const target_parent_abspath = path.dirname(target_abspath)
|
||
|
||
// make sure target is a valid file or directory and not a special character/block device/other weird file
|
||
const target_is_dir = target_dirent.isDirectory()
|
||
const target_is_file = target_dirent.isFile()
|
||
assert(target_is_dir || target_is_file, `Tried to create symlink to an unsupported file type:\n 🔗 ${prettyPath(symlink_path)}\n -> ❌ ${prettyPath(target_path)} (expected file or directory)`)
|
||
|
||
// create symlink file parent directories if needed
|
||
const symlink_filename = path.basename(symlink_path)
|
||
const symlink_parent_dir = path.dirname(symlink_path)
|
||
if (mkdirs) {
|
||
await fs.promises.mkdir(symlink_parent_dir, {recursive: true})
|
||
}
|
||
try {
|
||
assert((await fs.promises.stat(symlink_parent_dir)).isDirectory())
|
||
} catch(err) {
|
||
throw `Tried to create symlink in a directory that doesn't exist:\n 🔗 ${symlink_parent_dir}❌/${symlink_filename}\n -> ${target_path}\n ${err}`
|
||
}
|
||
const symlink_parent_abspath = await fs.promises.realpath(symlink_parent_dir)
|
||
const symlink_abspath = path.join(symlink_parent_abspath, symlink_filename)
|
||
|
||
// determine nearest common ancestor between symlink dir and target dir
|
||
const {
|
||
closest_common_ancestor,
|
||
symlink_to_ancestor_relpath,
|
||
target_from_ancestor_relpath,
|
||
symlink_to_target_relpath,
|
||
} = await findCommonAncestor(target_abspath, symlink_abspath, {relative, search_limit})
|
||
|
||
// set final target path to abspath or relative path depending on {relative} options
|
||
let target_path_final
|
||
if (relative) {
|
||
// make symlink into relative link (based on closest common ancestor dir between symlink_abspath and target_abspath)
|
||
target_path_final = symlink_to_target_relpath
|
||
// console.log(' 🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), `(as relative link: ${target_path_final})`)
|
||
} else {
|
||
// make symlink into an absolute path (verbatim passed target_path)
|
||
target_path_final = target_path
|
||
// console.log(' 🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), '(as absolute path)')
|
||
}
|
||
|
||
// remove any existing symlink at destination if there is already one there
|
||
const random_nonce = crypto.randomBytes(16).toString('hex').substring(0, 8)
|
||
const symlink_temp_path = `${symlink_abspath}.${random_nonce}.dup`
|
||
try { await fs.promises.unlink(symlink_abspath) } catch(err) {}
|
||
try { await fs.promises.unlink(symlink_temp_path) } catch(err) {}
|
||
|
||
// create the symlink and check that it works after creation
|
||
let created_symlink = null
|
||
try {
|
||
created_symlink = symlink_temp_path
|
||
await fs.promises.symlink(target_path_final, symlink_temp_path)
|
||
created_symlink = symlink_abspath
|
||
await fs.promises.rename(symlink_temp_path, symlink_abspath)
|
||
} catch(err) {
|
||
if (String(err).includes('EISDIR')) {
|
||
// console.warn('[⚠️] Tried to create symlink on top of existing directory', prettyPath(symlink_abspath))
|
||
|
||
// no real recourse in this situation, and its too noisy to log every time this happens
|
||
// it's also not always safe to move the dir out of the way, so better to just fail silently here, leaving:
|
||
// ${symlink_abspath}.${random_nonce}.dup
|
||
} else {
|
||
console.warn('[⚠️] Failed to create symlink', prettyPath(created_symlink), err)
|
||
}
|
||
}
|
||
|
||
let dirent
|
||
try {
|
||
dirent = await blockUntilExists(created_symlink, {timeout, min_bytes: 0})
|
||
// best we can do here is just check that it exists ^, trying to check that it has the exact expected abspath that we set is bad, because its a race condition:
|
||
// assert(dirent.abspath == target_abspath) // its often already overwritten by later activity, so final abspath may already be different
|
||
} catch(err) {
|
||
throw `Symlink created but does not seem to resolve to intended file:\n 🔗 ${symlink_path}\n -> ❌ ${target_path}\n actual=${dirent?.abspath}\n expected=${target_abspath}\n ${err}`
|
||
}
|
||
|
||
return {
|
||
symlink_path,
|
||
symlink_abspath: created_symlink,
|
||
symlink_filename: path.basename(created_symlink),
|
||
symlink_parent_abspath,
|
||
symlink_to_ancestor_relpath,
|
||
symlink_to_target_relpath,
|
||
|
||
target_path,
|
||
target_abspath,
|
||
target_filename,
|
||
target_parent_abspath,
|
||
target_from_ancestor_relpath,
|
||
target_path_final,
|
||
target_is_dir,
|
||
target_is_file,
|
||
target_is_relative: Boolean(relative),
|
||
|
||
closest_common_ancestor,
|
||
}
|
||
}
|
||
|
||
// test symlink and common ancestor finding
|
||
// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo.json', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo2.json'))
|
||
// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', {relative: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
|
||
// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269'))
|
||
// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/favorite_snapshots/1709724410.19269', {relative: false, mkdirs: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
|
||
|
||
|
||
|
||
async function overwriteDir(path) {
|
||
// delete any existing folder at the destination path (important otherwise we may create a folder inside an existing folder/symlink)
|
||
try {
|
||
await fs.promises.rm(path, { recursive: true, force: true });
|
||
} catch(err) {}
|
||
|
||
await fs.promises.mkdir(path, {recursive: true})
|
||
|
||
return path
|
||
}
|
||
|
||
async function overwriteFile(path, contents, options={encoding: 'utf8', flag: 'w', flush: false, block: true}) {
|
||
// write any JS value to a fresh file (e.g. String, Buffer, WritableStream, etc. anything JSON-serializable)
|
||
|
||
const block_until_created = options.block ?? true  // default to blocking; `options.block || true` would silently ignore an explicit block: false
|
||
delete options.block
|
||
|
||
try {
|
||
// delete any existing symlink/file present at the destination path
|
||
// (important otherwise we may write into an existing symlink by accident)
|
||
await fs.promises.unlink(path)
|
||
} catch(err) {}
|
||
|
||
try {
|
||
let nonce = 1
|
||
while ((await fs.promises.stat(path)).isDirectory()) {
|
||
// if we try to write a file to a path that already has a directory in that location
|
||
// (common when trying to write response JSON e.g. http://www.instagram.com/api/graphql returns json and www.instagram.com/api/graphql/abc returns json)
|
||
path = path.replace(`.${nonce-1}`, '') + `.${nonce}`
|
||
nonce++;
|
||
if (nonce > 20) throw `Too many conflicting files while trying to write to ${prettyPath(path)}`
|
||
}
|
||
} catch(err) {
|
||
if (!String(err).includes('no such file or directory')) {
|
||
console.warn('[⚠️] Warning: Problem with a conflicting directory while trying to write file', err)
|
||
}
|
||
}
|
||
|
||
// refuse writing undefined/null/function because its likely an error and not intended
|
||
const content_is_null = (contents === null) || (contents === undefined)
|
||
const content_is_func = (typeof contents === 'function')
|
||
if (content_is_null || content_is_func) {
|
||
throw `Cannot write ${typeof contents} ${contents} to file: ${path}`
|
||
}
|
||
|
||
// Numbers, BigInts, and Booleans can be cast to strings, then written to the file
|
||
const content_is_primitive = ['number', 'bigint', 'boolean'].includes(typeof contents)
|
||
if (content_is_primitive) {
|
||
contents = String(contents)
|
||
await fs.promises.writeFile(path, contents, options as any)
|
||
if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
|
||
return path
|
||
}
|
||
|
||
// Strings and Buffers can be written directly to file
|
||
const content_is_string = (typeof contents === 'string' || contents instanceof String)
|
||
const content_is_buffer = Buffer.isBuffer(contents)
|
||
if (content_is_string || content_is_buffer) {
|
||
await fs.promises.writeFile(path, contents, options as any)
|
||
if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
|
||
return path
|
||
}
|
||
|
||
// WritableStream objects can be piped into file
|
||
const content_is_stream = (contents?.pipe)
|
||
if (content_is_stream) {
|
||
const stream_byte_length = contents.writableLength
|
||
const dest_file = fs.createWriteStream(path);
|
||
await finished(contents.pipe(dest_file))
|
||
if (block_until_created) await blockUntilExists(path, {min_bytes: stream_byte_length})
|
||
return path
|
||
}
|
||
|
||
// Objects and Arrays can be JSON-stringified then written into file
|
||
const content_is_obj = (Array.isArray(contents) || typeof contents === 'object')
|
||
if (content_is_obj) {
|
||
contents = JSON.stringify(contents, null, 4)
|
||
await fs.promises.writeFile(path, contents, options as any)
|
||
if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
|
||
return path
|
||
}
|
||
throw `Cannot write contents of type ${typeof contents} to file: ${path} < ${contents}`
|
||
}
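// example usage (sketch): strings/Buffers are written as-is, plain objects are JSON-stringified, streams are piped
// await overwriteFile('/tmp/example.txt', 'hello world')
// await overwriteFile('/tmp/example.json', {TYPE: 'example', items: [1, 2, 3]})
// await overwriteFile('/tmp/example.bin', Buffer.from([0xde, 0xad, 0xbe, 0xef]))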
|
||
|
||
|
||
async function saveExecResult(bin, args=null, {original_url, version}, {cwd='.', timeout=300_000, ...spawn_options}={}) {
|
||
assert(bin)
|
||
assert(original_url && original_url.includes('://'))
|
||
assert(version)
|
||
|
||
const BIN_NAME = bin // 'yt-dlp'
|
||
const ARGS = args || [] // ['--some-arg', '--some-other-arg']
|
||
const CWD = cwd || process.cwd() // '.'
|
||
const TIMEOUT = timeout // honor the caller-provided timeout (defaults to 5min above)
|
||
const PATH = process.env.PATH
|
||
|
||
await fs.promises.mkdir(cwd, {recursive: true})
|
||
|
||
// quick-n-dirty dump of cmd to bash script, but this might be better: https://github.com/nodejs/node/issues/34840#issuecomment-677402567
|
||
const cmd_log_str = `#!/usr/bin/env bash
|
||
TYPE="${BIN_NAME}"
|
||
URL="${original_url}"
|
||
VERSION="${version}"
|
||
|
||
TIMEOUT=${TIMEOUT}
|
||
CWD="${CWD}"
|
||
PATH="${PATH}:$PATH"
|
||
|
||
${BIN_NAME} ${ARGS.map(arg => JSON.stringify(arg)).join(' ')}
|
||
`
|
||
const cmd_log = path.join(cwd, 'cmd.sh')
|
||
await overwriteFile(cmd_log, cmd_log_str)
|
||
|
||
const stdout_log = fs.createWriteStream(path.join(cwd, 'stdout.log'))
|
||
const stderr_log = fs.createWriteStream(path.join(cwd, 'stderr.log'))
|
||
|
||
const start_date = new Date()
|
||
const start_ts = Number(start_date)
|
||
const start_time = start_date.toISOString()
|
||
|
||
const child = child_process.spawn(
|
||
BIN_NAME,
|
||
ARGS,
|
||
{
|
||
cwd: CWD,
|
||
timeout: TIMEOUT, // 5min timeout
|
||
stdio: [null, 'pipe', 'pipe'], // </dev/null >./stdout.log 2>./stderr.log
|
||
// detached: true, // run in background, don't block on response
|
||
...(spawn_options || {}),
|
||
},
|
||
)
|
||
child.stdout.setEncoding('utf8')
|
||
child.stdout.pipe(stdout_log)
|
||
child.stderr.setEncoding('utf8')
|
||
child.stderr.pipe(stderr_log)
|
||
|
||
const exec_info = {
|
||
TYPE: BIN_NAME,
|
||
URL: original_url,
|
||
VERSION: version,
|
||
bin_name: BIN_NAME,
|
||
args: ARGS,
|
||
timeout: TIMEOUT,
|
||
hostname: os.hostname(),
|
||
bin_paths: PATH,
|
||
ppid: process.pid,
|
||
pid: child.pid,
|
||
start_ts,
|
||
start_time,
|
||
end_time: null,
|
||
end_ts: null,
|
||
duration: null,
|
||
returncode: null,
|
||
log_files: {},
|
||
output_files: {},
|
||
}
|
||
|
||
// promise that resolves when the command is finished executing
|
||
// TODO: refactor to use withTimeout
|
||
const getResult = (timeout=TIMEOUT) =>
|
||
new Promise((resolve, reject) => {
|
||
const loop = setInterval(() => {
|
||
if (exec_info.end_time) {
|
||
clearInterval(loop)
|
||
clearTimeout(timer)
|
||
resolve(exec_info)
|
||
}
|
||
}, 100)
|
||
|
||
const timer = setTimeout(() => {
|
||
clearInterval(loop)
|
||
if (!exec_info.end_time) {
|
||
reject(new Error(`Process ${BIN_NAME} did not finish within TIMEOUT=${TIMEOUT}`))
|
||
}
|
||
}, timeout);
|
||
})
|
||
|
||
const logFilesFilter = ({relpath}) =>
|
||
['cmd.sh', 'stdout.log', 'stderr.log'].includes(relpath)
|
||
|
||
const outputFilesFilter = ({relpath}) =>
|
||
!['cmd.sh', 'stdout.log', 'stderr.log', 'index.json'].includes(relpath)
|
||
|
||
const getOutputFiles = async (filter=outputFilesFilter) => {
|
||
return await getDirInfo(CWD, {filter, withHelpers: false, withRoot: false, maxdepth: 6})
|
||
}
|
||
|
||
child.on('close', async (returncode) => {
|
||
const end_date = new Date()
|
||
exec_info.returncode = returncode
|
||
exec_info.pid = child.pid
|
||
exec_info.end_ts = Number(end_date)
|
||
exec_info.end_time = end_date.toISOString()
|
||
exec_info.duration = exec_info.end_ts - exec_info.start_ts
|
||
exec_info.log_files = await getOutputFiles(logFilesFilter)
|
||
exec_info.output_files = await getOutputFiles(outputFilesFilter)
|
||
|
||
const end_metadata = `
|
||
# END_TIME="${exec_info.end_time}"
|
||
# DURATION=${exec_info.duration}
|
||
# RETURNCODE=${exec_info.returncode}
|
||
`
|
||
await fs.promises.appendFile(cmd_log, end_metadata)
|
||
|
||
// write exec_info json (which includes file list) to CWD/index.json
|
||
await overwriteFile(path.join(CWD, 'index.json'), exec_info)
|
||
})
|
||
// child.unref() // dont wait for child process to close
|
||
|
||
const start_metadata = `
|
||
#################### LAST RUN LOG ####################
|
||
# HOSTNAME="${exec_info.hostname}"
|
||
# PPID=${exec_info.ppid}
|
||
# PID=${exec_info.pid}
|
||
# START_TIME="${exec_info.start_time}"
|
||
`
|
||
await fs.promises.appendFile(cmd_log, start_metadata)
|
||
|
||
return {
|
||
...exec_info,
|
||
getResult,
|
||
}
|
||
}
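// example usage (sketch; MEDIA_DIR(page) is hypothetical and original_url/version are assumed to be in scope):
// const exec_info = await saveExecResult('yt-dlp', ['--no-playlist', original_url], {original_url, version}, {cwd: MEDIA_DIR(page)})
// const finished_info = await exec_info.getResult()  // resolves once the child closes (rejects if it exceeds the timeout)
// console.log(finished_info.returncode, Object.keys(finished_info.output_files))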
|
||
|
||
const HASH_CACHE = {}
|
||
|
||
async function sha256File(file_path: string, {pwd=null}: {pwd?: string}={}) {
|
||
return new Promise((resolve, reject) => {
|
||
pwd = pwd || path.dirname(file_path);
|
||
if (!file_path.startsWith(pwd)) {
|
||
file_path = path.join(pwd, file_path);
|
||
}
|
||
|
||
const dirent = fs.statSync(file_path);
|
||
const abspath = fs.realpathSync(file_path);
|
||
const cache_key = `${abspath}:${dirent.size}:${dirent.mtimeMs}`; // PATH:SIZE:LAST_MODIFIED_TIME
|
||
if (cache_key in HASH_CACHE) {
resolve(HASH_CACHE[cache_key]);
return;  // cached hash is still valid (same path, size, and mtime), skip re-hashing the file
}
|
||
|
||
const hash = crypto.createHash('sha256');
|
||
const rs = fs.createReadStream(abspath);
|
||
rs.on('error', reject);
|
||
rs.on('data', chunk => hash.update(chunk));
|
||
rs.on('end', () => {
|
||
const final_hash = hash.digest('hex');
|
||
HASH_CACHE[cache_key] = final_hash;
|
||
resolve(final_hash);
|
||
});
|
||
}) as Promise<string>
|
||
}
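// example usage (sketch): repeated calls are memoized in HASH_CACHE keyed by abspath:size:mtime
// const hash = await sha256File(path.join(DATA_DIR, 'archive', '1709724410.19269', 'seo.json'))
// console.log(hash)  // => 64-char hex sha256 digest of the file contents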

async function getDirSha256(dir_path, {pwd=null, withRoot=true, filter=null, maxdepth=-1, subfiles=null}={}) {
    // console.log('CALCULATING SHA256 OF FILES IN DIR', dir_path, {withRoot, filter, maxdepth})
    // dir_path:  path                absolute or relative path of the directory you want the merkle sha256 for
    // pwd:       path (optional)     absolute path of the directory you want to interpret dir_path relative to
    // withRoot:  bool                include a summary entry for the root dir_path dir in the list as '.'
    // filter:    function (optional) filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
    // maxdepth:  number (optional)   does not affect the actual calculations, but hides entries below a certain depth in the returned output for brevity
    // subfiles:  dirent[] (optional) instead of reading the disk, manually provide a getDirEntries() result list to use

    pwd = pwd || dir_path
    if (!dir_path.startsWith(pwd)) {
        dir_path = path.join(pwd, dir_path)
    }

    const dirent = await fs.promises.stat(dir_path)
    assert(dirent.isDirectory(), `Tried to compute sha256 of path but missing or not a directory! ${dir_path}`)
    assert((maxdepth >= -1), `maxdepth must be -1, 0, or 1, 2, 3, etc... (got ${maxdepth})`)

    // assert(!(filter && withRoot), `Cannot generate root hash (consistently) when a custom filter is provided!`)

    // get the sha256 of every file in the directory recursively (excluding hidden files and symlinks)
    // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum
    const all_subfiles = (subfiles as string[]) || await getDirEntries(dir_path, {
        pwd,
        recursive: true,
        includeFiles: true,
        includeDirs: false,

        // ~~maxdepth,~~  // don't pass maxdepth here, we need the entire file listing to accurately calculate parent dir hashes.
                          // it never makes sense to ignore subfiles beyond a certain depth for hash calculation: hashes are
                          // only useful IDs if they are consistent + repeatable, and hashing to an arbitrary depth would produce
                          // many different hashes for the same directory, needlessly polluting the hash space.

        filter,           // we do however allow passing a manual filter func, which does affect the hash
                          // (useful for quick checks to see whether a certain subset of files has changed or not)
    })
    const hashes: {[key: string]: string} = {}
    let hashable_summary_str = ''
    for (const subfile of all_subfiles) {
        // {'versions/20240413144307/screen recording.mp4': '1df4d9c3aca8b36f1f73e327d56038f80a35db407a298edb16c72576d7dd894e', ...}
        hashes[subfile] = await sha256File(subfile, {pwd})
        const relpath = path.relative(await fs.promises.realpath(dir_path), await fs.promises.realpath(path.join(pwd, subfile)))
        hashable_summary_str += `${hashes[subfile]} ./${relpath}\n`
    }
    // console.log('CALCULATED HASHES FOR ALL SUBFILES IN DIR', dir_path, Object.keys(hashes).length)

    // get the list of subdirectories and recursively hash every subdirectory
    // EQUIVALENT TO: find . -type d -not -path '*/.*' -maxdepth ${maxdepth} -print | sort
    const subdirs = await getDirEntries(dir_path, {pwd, recursive: true, includeHidden: false, includeDirs: true, includeFiles: false, filter, maxdepth})

    // for each subdirectory, get its hash recursively and store it in the hash list
    for (const subdir of subdirs) {
        // console.log('GETTING SUBDIR HASH', subdir)
        // a directory's hash is defined as the hash of all the *files* within it (excluding dirs/symlinks/hidden files)
        const subdir_hashes = await getDirSha256(
            subdir,
            {pwd, withRoot: true, filter, maxdepth: 0},
        )
        hashes[subdir] = subdir_hashes['.']
    }
    // console.log('CALCULATED HASHES FOR ALL SUBDIRS IN DIR', dir_path, subdirs.length)

    // filter the results if maxdepth is provided
    if (maxdepth >= 0) {
        for (const subpath of Object.keys(hashes)) {
            if (pathDepth(subpath) > maxdepth) {
                delete hashes[subpath]
            }
        }
    }
    // console.log('LIMITED OUTPUT DUE TO MAXDEPTH', maxdepth, Object.keys(hashes).length)

    // calculate the hash of the root '.' folder by hashing all of the hashes of its contents
    // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum
    if (withRoot) {
        // pass the first command's output (the file list + hashes) into another sha256
        // to get the final hash of the whole directory combined
        // console.log('CALCULATING FINAL ROOT HASH for ', dir_path)
        // console.log(hashable_summary_str)
        hashes['.'] = crypto.createHash('sha256').update(hashable_summary_str).digest('hex') as string
        // console.log('--->', hashes['.'])
    }

    return hashes
}
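
// Illustrative sketch of the map returned by getDirSha256() (the paths/hashes below are placeholders):
//
//   const hashes = await getDirSha256('/opt/archivebox/data/archive/1709039915.378868')
//   // => {
//   //      'index.html':                          '<sha256 of that file>',
//   //      'versions/20240413144307/output.pdf':  '<sha256 of that file>',
//   //      'versions':                            '<sha256 of the file list + hashes inside it>',
//   //      '.':                                   '<root hash of the whole dir>',
//   //    }
//
//   // the '.' entry should match the shell pipeline documented above:
//   //   cd /opt/archivebox/data/archive/1709039915.378868 && \
//   //     find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum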


async function getDirInfo(dir_path, {pwd=null, withRoot=true, withHelpers=true, filter=null, maxdepth=-1, subfiles=null}={}) {
    // get a detailed JSON/dumpable index of a directory's contents, w/ merkle sha256's, sizes, and mimeTypes
    // dir_path:    path                absolute or relative path of the directory you want size info for
    // pwd:         path (optional)     absolute path of the directory you want to interpret dir_path relative to
    // withRoot:    bool                include a summary entry for the root dir_path dir in the list as '.'
    // withHelpers: bool                attach many extra helper attrs/funcs to results (beyond the JSON-serializable core data)
    // filter:      function (optional) filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
    // maxdepth:    number (optional)   does not affect the actual calculations, but hides entries below a certain depth in the returned output for brevity
    // subfiles:    dirent[] (optional) instead of reading the disk, manually provide a getDirEntries() result list to use

    // {
    //     ...
    //     'example.txt': { ... },
    //     'foobar/example.mp3': { ... },
    //     '.': {                  // this is the fully augmented result when withHelpers=true
    //         is_file: false,
    //         is_dir: true,
    //         filename: '.',
    //         basename: '1709039915.378868',
    //         mimeType: 'inode/directory',
    //         extension: undefined,
    //         num_bytes: 11540961,
    //         num_subpaths: 15,
    //         sha256: '9fc58b3ed887e7139338062ebd49bd6795373759e8acb73d2f7a40f1413789da',
    //         reldepth: 1,
    //         relpath: './',
    //         cwd: '/opt/archivebox/data/archive/1709039915.378868/',
    //         dirname: '/opt/archivebox/data/archive',
    //         abspath: '/opt/archivebox/data/archive/1709039915.378868',
    //         dirent: Stats {
    //             dev: 16777240,
    //             mode: 16895,
    //             uid: 501,
    //             ...
    //             mtimeMs: 1717160622956.1357,
    //             ctimeMs: 1717160622956.1357,
    //         },
    //         created: '2024-05-31T13:03:42.956Z',
    //         modified: '2024-05-31T13:03:42.956Z',
    //         summary: './data/archive/1709039915.378868 (inode/directory 11541kb 9fc58b3e)',
    //         helptext: 'Verify these hashes by running:\n' +
    //             ' cd /opt/archivebox/data/archive/1709039915.378868 \n' +
    //             " find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum",
    //     },
    // }

    pwd = pwd || dir_path
    if (!dir_path.startsWith(pwd)) {
        dir_path = path.join(pwd, dir_path)
    }

    // calculate hashes and sizes recursively
    const hashes = await getDirSha256(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})
    const sizes = await getDirSizes(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})

    const num_total_subpaths = Object.keys(hashes).filter(name => name !== '.').length

    const details = {}
    for (const [filename, sha256] of Object.entries(hashes)) {
        if (filename === '.' && !withRoot) continue

        const abspath = await fs.promises.realpath(path.join(dir_path, filename))
        const dirent = await fs.promises.stat(abspath)
        const num_subpaths = Object.keys(hashes).filter(subpath => subpath.startsWith(filename + '/')).length
        const is_file = dirent.isFile()
        const is_dir = dirent.isDirectory()

        // bare-bones info suitable for JSON dumps/exports
        const basic_info = {
            sha256,
            num_bytes: sizes[filename],
            created: (new Date(dirent.ctimeMs)).toISOString(),
            mimeType: undefined,
            extension: undefined,
            num_subpaths: undefined,
        }
        if (is_dir) {
            basic_info.mimeType = 'inode/directory'
            basic_info.extension = undefined
            basic_info.num_subpaths = (filename === '.') ? num_total_subpaths : num_subpaths
        }
        if (is_file) {
            basic_info.mimeType = mime.lookup(abspath) || null
            basic_info.extension = path.extname(filename)
            basic_info.num_subpaths = undefined
        }

        // extra helpers suitable for usage in other areas of the codebase
        const info_with_helpers = {
            ...basic_info,
            filename,
            basename: path.basename(abspath),
            dirname: path.dirname(abspath),
            cwd: dir_path,
            relpath: is_dir ? (filename + '/') : filename,
            reldepth: pathDepth(filename),
            abspath,
            is_file,
            is_dir,
            dirent,
            modified: (new Date(dirent.mtimeMs)).toISOString(),
            summary: `${prettyPath(abspath)} (${basic_info.mimeType} ${Math.round(basic_info.num_bytes/1000)}kb ${sha256.substring(0, 8)})`,
            helptext: undefined,
        }
        if (filename === '.') {
            info_with_helpers.helptext = `Verify these hashes by running:\n cd ${prettyPath(abspath)} \n find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum`
        }

        if ((typeof filter) === 'function') {
            if (!filter(info_with_helpers)) continue
        }

        details[filename] = withHelpers ? info_with_helpers : basic_info
    }
    return details
}

// console.log(await getDirSha256(
//     '/opt/archivebox/data/archive/1709039915.378868/',
//     {
//         withRoot: true,
//         maxdepth: -1,
//         filter: ({relpath}) => relpath.startsWith('versions'),
//     },
// ))
// console.log(await getDirSizes(
//     '/opt/archivebox/data/archive/1709039915.378868/',
//     {
//         withRoot: false,
//         maxdepth: 2,
//         filter: ({relpath}) => !relpath.startsWith('versions'),
//     },
// ))
// console.log(await getDirInfo(
//     '/opt/archivebox/data/archive/1709039915.378868/',
//     {
//         withRoot: true,
//         withHelpers: true,
//         maxdepth: 1,
//         // filter: ({relpath}) => relpath.startsWith('versions'),
//     },
// ))
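
// A possible (hypothetical) way to persist the JSON-safe form of getDirInfo() alongside a snapshot;
// 'dir_info.json' is not a filename used elsewhere in this script, and this assumes overwriteFile()
// accepts a string as well as a Buffer:
//
//   const dir_info = await getDirInfo(TASK_PATH(url), {withHelpers: false})
//   await overwriteFile(path.join(TASK_PATH(url), 'dir_info.json'), JSON.stringify(dir_info, null, 4))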

type DetectFilenameOptions = {
    url?: string,
    response?: HTTPResponse | Response,
    page?: Page,
    dir?: string,
    abspath?: string,
    filename?: string,
    basename?: string,
    extension?: string,
    mimeType?: string,
    resourceType?: string,
}

async function detectFilename({ url, response, page, dir, abspath, filename, basename, extension, mimeType, resourceType }: DetectFilenameOptions) {
    // this function takes a url (and/or response/page), and detects the abspath, dir, filename, basename, extension, and mimeType
    // from the URL (+ any enforced path components passed in via args)
    // example: detectFilename({url: 'https://example.com/favicon.png', extension: 'ico'}) outputs 'favicon.ico'
    //
    // it has some quirks that are specific to archiving and may not behave as you expect
    // e.g. if visiting the url https://example.com/error.zip returns a 500 text/html error page,
    // it may still save it as a .zip with mimeType=application/x-zip and ignore the response mimeType, because the url ends in .zip
    // however, if the url has no extension, e.g. https://example.com/error, it will
    // auto-detect the mimeType based on the response and append an extension, saving it as error.html
    //
    // ⚠️ SECURITY WARNING: think carefully about the permissions, shell injection, and RCE implications of any changes made here ⚠️
    // this function writes untrusted web content to the filesystem using the auto-detected mimetype to coerce the extension,
    // which can be dangerous (e.g. what if one of these downloads is a malicious ransomware .exe, do we really want to give it .exe?
    // and if we do, how do we make sure it never gets executed, without damaging the integrity of the copy?)

    if (!(response || page)) throw 'Either a page or a response must be provided in order to detect mimeType & URL'

    if (response && (typeof response.headers !== 'function')) {
        // normalize a node-fetch Response into a puppeteer-HTTPResponse-like object
        const node_fetch_response: Response = response as Response
        response = {
            url: () => node_fetch_response.url,
            headers: () => node_fetch_response.headers,
        } as unknown as HTTPResponse
    }
    response = response as HTTPResponse

    url = url || response?.url() || (await page.url())
    if (!url) throw 'URL was not provided and could not be detected from {response, page}'

    // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
    try {
        resourceType = resourceType || response?.request()?.resourceType()
    } catch(err) {
        // ignore, sometimes response is null/not available
    }
    const resourceTypeToMimeType = {
        'Stylesheet': 'text/css',
        'Script': 'application/x-javascript',
        'WebSocket': 'application/json',
        'Website': 'text/html',
    }

    mimeType = mimeType || resourceTypeToMimeType[resourceType]         // guess mimeType based on the request resourceType
    extension = extension || (mimeType ? mime.extension(mimeType) : null)

    // handle special url cases (e.g. schemes in URL_SCHEMES_IGNORED)
    if (url.startsWith('about:blank')) {
        filename = 'about_blank'
        mimeType = 'text/html'
    }
    else if (url.startsWith('data:')) {
        filename = `data__${hashCode(url)}`
    }

    // console.log('detectFilename>', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})

    if (abspath) {
        if (dir || filename || basename || extension)
            throw '{abspath} should not be passed with other options (e.g. dir, filename, basename, extension)'
        var {dir, base: filename, ext: extension, name: basename} = path.parse(abspath)
        // path.parse('/home/user/dir/file.txt') returns:
        //   { root: '/',
        //     dir: '/home/user/dir',
        //     base: 'file.txt',
        //     ext: '.txt',
        //     name: 'file' }
    } else {
        dir = dir || path.resolve(process.cwd())

        filename = filename                                   // https://example.com/a.1.zip?e.pdf=2#g.h=3 => a.1.zip
            || (new URL(url)).pathname.split('/').at(-1)      // https://example.com/file124.rss => file124.rss  (prefers last component of path with no query/hash)
            || 'index'                                        // https://example.com/abc/def/ => index.html
            // || (new URL(url)).hostname.replaceAll('.', '_')   // https://example.com => example_com (disabled, so this case falls back to index.html)
    }
    if (!filename) throw 'filename/abspath were not passed and could not be detected from url'

    const path_extname = path.extname(filename)
    const resp_mimetype = response && (
        (response as any).mimeType
        || response.headers()['content-type']?.split(';')[0]
        || resourceTypeToMimeType[resourceType]
        || 'application/octet-stream'
    )

    mimeType = mimeType                                       // https://example.com/a.1.zip?e.pdf=2#g.h=3 => application/x-zip  (prefers mimetype based on the extension in the path, falls back to the response mimeType)
        || (path_extname && mime.lookup(path_extname))        // https://example.com/file124.rss => application/rss+xml
        || resp_mimetype                                      // https://example.com/get?type=png => image/png

    extension = extension
        || (path_extname && path_extname.replace('.', ''))    // https://example.com/a.1.zip?e.pdf=2#g.h=3 => zip  (prefers the extension in the path, falls back to the response mimeType's suggested extension)
        || (resp_mimetype && mime.extension(resp_mimetype))   // https://example.com => html
        || ''                                                 // https://example.com/websocket.1 =>
    if (extension.startsWith('.'))
        extension = extension.slice(1)

    basename = basename                                       // https://example.com/a.1.zip?e.pdf=2#g.h=3 => a.1  (prefers the filename in the path without its extension, falls back to the domain name)
        || (path.parse(filename).name)                        // https://mp4dl.example.com => mp4dl_example_com

    basename = basename.slice(0, 120)                             // truncate at 120 characters (leaving 8 chars for .ext)
    basename = basename.replace(/[^a-zA-Z0-9%+?&=@;_ \.-]/g, '')  // strip characters not allowed in filenames

    filename = basename + '.' + extension

    if (filename.endsWith('.'))
        filename = filename.slice(0, -1)

    abspath = abspath || path.join(dir, filename)

    // console.log('detectFilename<', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})

    return {
        url,
        dir,
        abspath,
        filename,
        basename,
        extension,
        mimeType,
        resourceType,
        resp_mimetype,
    }
}
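
// A few illustrative detectFilename() calls (expected outputs are approximate, not authoritative):
//
//   await detectFilename({url: 'https://example.com/report.pdf', response})
//   //   => { filename: 'report.pdf', extension: 'pdf', mimeType: 'application/pdf', ... }
//
//   await detectFilename({url: 'https://example.com/api/items?format=json', response})
//   //   => { filename: 'items.json', ... }   (no extension in the path, so the response content-type decides)
//
//   await detectFilename({url: 'https://example.com/favicon.png', response, extension: 'ico'})
//   //   => { filename: 'favicon.ico', ... }  (an explicitly passed extension overrides the URL/response)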

interface DownloadOptions extends DetectFilenameOptions {
    browser?: Browser
    expected_mimetype?: string
    timeout?: number
}

async function download({ url, browser, page, response, dir, abspath, filename, basename, extension, expected_mimetype, timeout }: DownloadOptions) {
    url = url || (response as HTTPResponse)?.url() || (await page?.url())
    ALREADY_ARCHIVED.add(url.slice(0, 4096))  // prevent running the whole archive task on tabs we create just for downloading

    browser = browser || (page && (await page.browser()))
    timeout = timeout || 120_000
    expected_mimetype = expected_mimetype || ''
    let newPage = null
    let errors = []
    let num_bytes = 0
    let bytesBuffer = null


    // if we need to fetch the url (i.e. it has not already been requested)
    if (!response) {
        if (!browser) throw 'No {browser} or {page} was provided to download with'
        newPage = await browser.newPage()
        if (page) await page.bringToFront()  // if an origin page is provided, make sure it stays in the foreground
        response = await newPage.goto(url, {timeout: timeout, waitUntil: 'networkidle0'})
        if (page) await page.bringToFront()  // if an origin page is provided, make sure it stays in the foreground
    }
    url = url || (response as HTTPResponse)?.url() || (await newPage?.url()) || (await page?.url());
    const response_mimetype = (response as HTTPResponse).headers()['content-type']?.split(';')[0] || 'text/html'

    // detect the filename we should write to based on the provided url/response/page/filename/extension suggestions
    var {
        dir,
        abspath,
        filename,
        basename,
        extension,
        mimeType,
    } = await detectFilename({url, page, response, dir, abspath, filename, basename, extension, mimeType})  // mimeType is hoisted by this var destructuring (still undefined on the way in)

    // if expected_mimetype is passed, make sure the response matches it, otherwise consider the download a failure
    if (!response_mimetype.startsWith(expected_mimetype)) {
        errors.push(`Expected ${expected_mimetype} but got ${response_mimetype}`)
    } else {

        // download the file using puppeteer's response.buffer()
        try {
            // write the response bytes into the output file
            bytesBuffer = await (response as HTTPResponse).buffer()
            await overwriteFile(abspath, bytesBuffer)
            num_bytes = bytesBuffer.length
        } catch(err) {
            errors.push(err)
        }

        // security check to make sure the downloaded file is not executable (random binaries downloaded off the internet = dangerous)
        fs.access(abspath, fs.constants.X_OK, (err) => {
            if (!err) console.warn(
                '[⚠️] SECURITY WARNING: Downloaded file appears to be executable:', prettyPath(abspath),
                '\n (be careful running untrusted programs downloaded from the internet!)'
            )
        })
    }

    // if we opened a dedicated page for downloading, close it now
    if (newPage) {
        await newPage.close()
    }

    if (errors.length) {
        // console.warn(`[❌] Downloading ${url} (${mimeType}) to ${abspath} failed:`, JSON.stringify(errors, null, 4))
    } else {
        console.log(`[💾] Downloaded ${url.substring(0, 40)} (${num_bytes} ${mimeType})...`.padEnd(82), prettyPath(abspath))
    }

    return {
        url, response, errors,
        dir, abspath, filename, basename, extension, mimeType,
        bytesBuffer, num_bytes,
    }
}
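
// Example (illustrative) usage from inside an archiving task; `page` is assumed to be an already-open
// puppeteer Page, and the URL/path below are placeholders:
//
//   const {abspath, num_bytes, errors} = await download({
//       page,
//       url: 'https://example.com/media/video.mp4',
//       dir: TASK_PATH('https://example.com/media/video.mp4'),
//       expected_mimetype: 'video/',    // prefix match, so any video/* content-type is accepted
//       timeout: 60_000,
//   })
//   if (errors.length) console.warn('[❌] download failed:', errors)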


/************************** Puppeteer Launching *******************************/


async function startCluster(puppeteer, args=CHROME_ARGS_DEFAULT) {
    console.log(`[🎭] Launching ${CHROME_CLUSTER_WORKERS}x Chromium browsers with puppeteer-cluster:`.padEnd(82), prettyPath(CHROME_PROFILE_PATH))
    const cluster = await Cluster.launch({
        puppeteer,
        monitor: true,
        maxConcurrency: CHROME_CLUSTER_WORKERS,
        sameDomainDelay: 2550,
        workerCreationDelay: 250,
        timeout: 300_000,                       // total ms timeout for an entire task (1000ms * 60s * 5m)
        concurrency: Cluster.CONCURRENCY_PAGE,  // share cookies between all tabs in a given browser
        puppeteerOptions: {
            args,                               // all the chrome launch CLI args
            ignoreDefaultArgs: true,            // trust me, we have enough args already...
            // dumpio: true,                    // full debug log output, super noisy
        }
    })
    console.log('*************************************************************************')
    return cluster
}

async function remoteBrowser(puppeteer, {browserURL, browserWSEndpoint}) {
    console.log('[🎭] Connecting Puppeteer to existing Chromium browser via:', browserURL || browserWSEndpoint)
    let completed_initial_connection = false
    // the targetFilter ignores the browser's pre-existing targets during the initial connect, then tracks new ones
    const browser = await puppeteer.connect({browserURL, browserWSEndpoint, defaultViewport: null, targetFilter: () => completed_initial_connection})
    completed_initial_connection = true
    console.log('*************************************************************************')
    return browser
}
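
// To drive an existing browser with remoteBrowser() instead of launching a new one, start
// Chrome/Chromium with remote debugging enabled first (illustrative, adjust the port/profile to taste):
//
//   chromium --remote-debugging-port=9222 --user-data-dir=/path/to/profile      (in a separate shell)
//
//   const browser = await remoteBrowser(Puppeteer, {browserURL: 'http://localhost:9222/'})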

async function startBrowser(puppeteer, args=CHROME_ARGS_DEFAULT) {
    console.log('[🎭] Launching Puppeteer Chromium browser...'.padEnd(82+1), prettyPath(CHROME_PROFILE_PATH))

    const browser = await puppeteer.launch({ignoreDefaultArgs: true, args, dumpio: true})
    globalThis.browser = browser
    console.log('*************************************************************************')

    // store all active tabs on a global var keyed by url, for easier vscode interactive debugging
    const storeTabForDebugger = async (target) => {
        try {
            globalThis.tabs = globalThis.tabs || {}
            const url = target.url()
            const page = await target.page()
            if (!page || page?.isClosed()) {
                delete globalThis.tabs[url]
            } else {
                globalThis.tab = page
                globalThis.tabs[url] = page
            }
        } catch(err) {console.warn(err)}
    }
    browser.on('targetcreated', storeTabForDebugger)
    browser.on('targetchanged', storeTabForDebugger)
    browser.on('targetdestroyed', storeTabForDebugger)

    // wait for the initial extension background.js/service worker targets to load
    await wait(3_000)

    // prime the extensions cache
    const extensions = await getChromeExtensionsFromCache({browser})
    globalThis.extensions = extensions  // for easier debugging only

    // give the user 2min to check any issues with the initial startup pages (bot profile pages),
    // solve captchas, re-login, etc., then close them after that to save resources
    const startup_pages = (await browser.pages())
    const startup_page_close_delay = 120_000
    setTimeout(async () => {
        for (const page of startup_pages) {
            try { await page.close() } catch(err) { /* page may already be closed by now, which is fine */ }
        }

    }, startup_page_close_delay)

    // set up any extensions that need final runtime configuration using their options pages
    // await setup2CaptchaExtension({browser, extensions})

    // open a placeholder page so the browser window stays open when there are no active archiving pages
    // (it's annoying to have the entire window open/close/open/close/etc. every time an archive task runs)
    const empty_page = await browser.newPage()
    await wait(250)
    await empty_page.goto('chrome://version')
    await wait(500)
    console.log('*************************************************************************')

    return browser
}
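
// With the target handlers above, an attached debugger or REPL can poke at tabs interactively,
// e.g. (illustrative, the output path is a placeholder):
//
//   Object.keys(globalThis.tabs)                                      // list URLs of all tracked tabs
//   await globalThis.tab.screenshot({path: '/tmp/current_tab.png'})   // screenshot the most recently touched tab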

async function startAPIServer(port=API_SERVER_PORT, host=API_SERVER_HOST, taskCallback=null) {
    // taskCallback should be an async function that takes ({url}) and does something with it
    assert(taskCallback && (typeof taskCallback === 'function'))

    const server = createServer(async (req, res) => {
        if (req.method === 'POST') {
            console.log(`[API][POST] ${req.url}`)
            let body = '';

            req.on('data', (chunk) => {
                body += chunk;
            });

            req.on('end', () => {
                try {
                    const jsonData = JSON.parse(body);
                    // process the JSON data (currently just logged; POST does not queue a task yet)
                    console.log(jsonData);

                    res.writeHead(200, { 'Content-Type': 'application/json' });
                    res.end(JSON.stringify({ message: 'JSON data received' }));
                } catch (error) {
                    res.writeHead(400, { 'Content-Type': 'application/json' });
                    res.end(JSON.stringify({ error: 'Invalid JSON data' }));
                }
            });
        } else if (req.method === 'GET') {
            console.log(`[API][GET] ${req.url}`)
            const parsedUrl = new URL(`http://${host}:${port}${req.url}`)
            const query = new URLSearchParams(parsedUrl.search);
            const url = query.get('url');
            if (url && url.includes('://')) {
                res.writeHead(200, { 'Content-Type': 'text/plain' });
                try {
                    await taskCallback({url})
                    res.end(`${url}\n${TASK_PATH(url)}`);
                } catch(err) {
                    res.end(`${url}\n${TASK_PATH(url)}\n${err}`);
                }
            } else {
                res.writeHead(400, { 'Content-Type': 'text/plain' });  // missing/invalid ?url= is a client error
                res.end(`Bad URL: ${url}\n\nExpected: /?url=https://example.com/url/to/archive`);
            }
        } else {
            res.writeHead(405, { 'Content-Type': 'application/json' });
            res.end(JSON.stringify({ error: 'Method not allowed' }));
        }
    })

    server.listen(port, host, () => {
        console.log(`[🎰] API Server listening for requests on http://${host}:${port}/?url=...`);
    })
    console.log('*************************************************************************')

    return server
}
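
// Example (illustrative) of submitting a URL to the running API server from another shell
// (substitute whatever API_SERVER_HOST / API_SERVER_PORT are configured to be above):
//
//   curl 'http://127.0.0.1:8080/?url=https://example.com/page/to/archive'
//
// The GET handler queues the URL via taskCallback and responds with the URL plus its TASK_PATH() output dir.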

async function main(urls, cluster=CHROME_CLUSTER) {
    process.chdir(DATA_DIR)

    const extensions = await getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR})
    const args = getChromeArgs({...CHROME_LAUNCH_OPTIONS, CHROME_EXTENSIONS: extensions})
    const preferences = getChromePreferences({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_DOWNLOADS_DIR, CHROME_EXTENSIONS: extensions})
    const Puppeteer = applyChromePreferences(PupeteerExtra, CHROME_PREFERENCES_PATH, preferences)

    Puppeteer.use(StealthPlugin());
    // Puppeteer.use(ReplPlugin());
    // handled by the uBlock Origin & ReCaptcha browser extensions, probably not needed here anymore:
    // Puppeteer.use(RecaptchaPlugin({
    //     provider: {id: '2captcha', token: API_KEY_2CAPTCHA},
    //     visualFeedback: true,
    // }))
    // const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
    // puppeteer.use(AdblockerPlugin({ blockTrackers: true }))

    if (cluster) {
        // launch a browser with multiple tabs w/ puppeteer-cluster
        const cluster = await startCluster(Puppeteer, args)

        const handleTask = async ({url}) => cluster.queue(url, botArchiveTask)
        const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask)

        console.log('[📋] Running tasks in parallel with puppeteer cluster...')
        for (const url of urls) {
            if (fs.existsSync(path.join(TASK_PATH(url), 'aiqa.json'))) {
                try {
                    // if an existing aiqa.json parses cleanly, the snapshot is considered done already
                    JSON.parse((await fs.promises.readFile(path.join(TASK_PATH(url), 'aiqa.json'))).toString())
                    console.log(' skipping (already present):', TASK_PATH(url), url)
                    continue
                } catch(err) {
                    // pass
                }
            }
            cluster.queue(url, botArchiveTask)
            await wait(3_000)
        }

        await cluster.idle();
        await cluster.close();
    } else {
        // launch a single new browser w/ puppeteer / connect to a remote CDP browser w/ puppeteer
        const browser = await startBrowser(Puppeteer, args)
        // const browser = await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})

        // run a speedtest in the background
        speedtest({browser})

        const handleTask = async ({url}) => await botArchiveTask({page: (await browser.newPage()), data: url})
        const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask)

        // wait for any pre-run setup tasks or server requests
        await wait(5_000)

        let num_succeeded = 0
        let num_failed = 0

        console.log(`[📋] Running ${urls.length} tasks sequentially with puppeteer browser...`)
        for (const url of urls) {
            const run_count = (num_succeeded + num_failed) || 1

            // check if the task should be run or skipped based on existing snapshot data present in the directory
            const metrics_path = path.join(TASK_PATH(url), 'metrics.json')
            const screenshot_path = path.join(TASK_PATH(url), 'screenrecording.gif')
            const aiqa_path = path.join(TASK_PATH(url), 'aiqa.json')
            const versions_path = path.join(TASK_PATH(url), 'versions')
            if (fs.existsSync(metrics_path) && fs.existsSync(screenshot_path) && fs.existsSync(aiqa_path) && fs.existsSync(versions_path)) {
                try {
                    const ai_qa_result = JSON.parse(await fs.promises.readFile(aiqa_path, 'utf-8'))
                    console.log(prettyPath(TASK_PATH(url)), `${ai_qa_result.pct_visible}%`, ai_qa_result.website_brand_name, url.substring(0, 80))
                    assert(ai_qa_result.website_brand_name)
                    continue
                } catch(err) {
                    // pass
                }
            }
            let delay = 0

            // create a new browser page and run the archiving task
            const page = (await browser.newPage())
            try {
                console.log(ANSI.black + `◤==============================================================================[${String(run_count).padStart(3)}]/[${urls.length}]◥` + ANSI.reset)
                await botArchiveTask({page, data: url})
                delay = 1_000
                num_succeeded += 1
            } catch(err) {
                console.error('[❌] Archiving task failed!', url)
                console.error(err)
                num_failed += 1
                delay = 15_000  // extra delay if there are errors
            }
            console.log(ANSI.black + `◣==============================================================================[☑ ${num_succeeded}][🆇 ${num_failed}]◢` + ANSI.reset)

            // check for abnormally high failure rates and exit early if needed
            const failure_pct = Math.round((num_failed/run_count) * 100)
            if (failure_pct > 50) {
                if (run_count > 5) {
                    console.warn(`[⚠️] ${failure_pct}% task failure rate is very high! Will self-cancel after 10 URLs if >50% continue to fail...`)
                }
                if (run_count > 10) {
                    throw `Too many tasks failed in a row! Quitting early after ${run_count}/${urls.length} tasks.`
                }
            }

            // increase the delay between tasks based on the ratio of how many are failing vs succeeding:
            // delay grows as 4^(failed/(succeeded+3)) * base delay, i.e. exponentially as failures pile up
            delay = Math.pow(4, (num_failed/(num_succeeded + 3))) * delay
            //   e.g. 0:1 failed:succeeded ratio ~= 1x  base delay  (~1s-15s)
            //        1:1 failed:succeeded ratio ~= 4x  base delay  (~1m)
            //        2:1 failed:succeeded ratio ~= 16x base delay  (~4m)
            //        3:1 failed:succeeded ratio ~= 64x base delay  (~16m)
            //        etc... up to the 1hr cap below
            delay = Math.min(delay, 3_600_000)  // 1hr maximum delay between tasks
            delay = Math.max(delay, 1_000)      // 1s minimum delay between tasks
            if (delay > 2_500) {
                console.log('... waiting', Math.round(delay/1000), 'seconds (self rate-limit)...')
            }
            await wait(delay)  // base ratelimit
            console.log()
        }


        if (PASSIVE_ARCHIVING) {
            // replace these as-needed:
            const browserURL = 'http://localhost:9222/'
            const browserWSEndpoint = 'ws://localhost:9222/devtools/browser'

            const driver_browser = browser || await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})
            const archiver_browser = {}  // await startBrowser(Puppeteer, args)

            const extensions = await getChromeExtensionsFromCache({browser: driver_browser})

            // close both browsers if either one is closed
            let browser_is_open = true
            driver_browser.on('disconnected', async () => {browser_is_open = false})  // await archiver_browser.close()
            // archiver_browser.on('disconnected', async () => {browser_is_open = false; await driver_browser.close()})

            // handle any tab navigation to a new URL in the driver browser
            const handleUserNavigation = async (target) => {
                const url = target.url()
                const page = await target.page()
                // const client = await target.createCDPSession()

                if (target.type() == 'page' && page && url) {
                    console.log(ANSI.black + '==============================================================================' + ANSI.reset)
                    console.warn('[➕] DRIVER BROWSER NAVIGATED:', ANSI.blue, url, ANSI.reset)

                    try {
                        await passiveArchiveTask({browser: driver_browser, page, url})
                        await wait(3_000)
                    } catch(err) {
                        console.error('[❌] Archiving task failed!', url)
                        console.error(err)
                        await wait(10_000)  // base ratelimit
                    }
                    console.log(ANSI.black + '==============================================================================' + ANSI.reset)
                    // await client.send('Page.enable')
                    // await client.send('Page.setWebLifecycleState', {state: 'active'})
                }
                // await client.send('Runtime.runIfWaitingForDebugger')
            }

            // set up handlers to archive a new page whenever one is opened or navigated
            driver_browser.on('targetcreated', handleUserNavigation)
            driver_browser.on('targetchanged', handleUserNavigation)

            console.log('------------------------------------------------------')
            console.log('[👀] Waiting for browser tabs to be opened by a human...')
            while (browser_is_open) {
                await wait(2_000)
            }
        } else {
            // no passive archiving: just keep the browser + API server alive indefinitely
            while (true) {
                await wait(2_000)
            }
        }

        await browser.close()
    }
    console.log('[✅] Finished all tasks and stopped browsers.')
    process.exit(0);
}


/******************************************************************************/
if (import.meta.main) {
    main(URLS).catch(console.error);
}

/******************************************************************************/

// if we want to handle CLI args in the future, minimist is great:
// var argv = require('minimist')(process.argv.slice(2));
// console.log(argv);  // --url=https://example.com --binpath=/browsers/chromium-1047/bin/chromium --datadir=/Chromium
// const {url, binpath, datadir} = argv;


// OLD CODE, may be useful in the future if we need audio in screenrecordings:
// async function setupScreenrecordingWithAudio(page, wss) {
//     console.log('[🎬] Setting up screen-recording plugin...');
//     const stream_port = (await wss).options.port;
//     // streamPage = await (page.browser()).newPage()
//     await page.goto(`chrome-extension://jjndjgheafjngoipoacpjgeicjeomjli/options.html#${stream_port}`)
//
//     // puppeteer-stream recording start
//     streamFile = fs.createWriteStream(SCREENRECORDING_PATH(page))
//     stream = await getStream(page, {
//         audio: true,
//         video: true,
//         bitsPerSecond: 8000000,  // 1080p video
//     });
//     stream.pipe(streamFile);
//     return {stream, streamFile}
//
//     // puppeteer-stream recording stop & cleanup
//     if (stream && streamFile) {
//         await stream?.destroy();
//         streamFile?.close();
//         // await streamPage.close();
//     }
// }