-
-
Notifications
You must be signed in to change notification settings - Fork 10.7k
Expand file tree
/
Copy pathcheck-liveness.mjs
More file actions
93 lines (79 loc) · 3.47 KB
/
Copy pathcheck-liveness.mjs
File metadata and controls
93 lines (79 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env node
/**
* check-liveness.mjs — Playwright job link liveness checker
*
* Tests whether job posting URLs are still active or have expired.
* Uses the same detection logic as scan.md step 7.5.
* Zero Claude API tokens — pure Playwright.
*
* Usage:
* node check-liveness.mjs <url1> [url2] ...
* node check-liveness.mjs --file urls.txt
*
* Exit code: 0 if all active, 1 if any expired or uncertain
*/
import { chromium } from 'playwright';
import { readFile } from 'fs/promises';
import {
checkUrlLivenessWithFallback,
createHeadedPageProvider,
newLivenessPage,
jitteredDelayMs,
sleep,
} from './liveness-browser.mjs';
async function main() {
const args = process.argv.slice(2);
// Portals like pracuj.pl serve a Cloudflare anti-bot wall to headless Chromium.
// On a challenge we retry once in a headed browser (which clears it); pass
// --no-fallback to stay fully headless (e.g. on a machine with no display).
const noFallback = args.includes('--no-fallback');
// --throttle or --throttle=<ms>: wait base..2*base ms (jittered) between checks
// to stay under rate-based WAF limits. pracuj.pl's Cloudflare flags the session
// after ~2 rapid hits, so a bulk run needs spacing. Default base 5000ms.
const throttleArg = args.find((a) => a === '--throttle' || a.startsWith('--throttle='));
const throttleBaseMs = throttleArg ? (Number(throttleArg.split('=')[1]) || 5000) : 0;
const positional = args.filter((a) => a !== '--no-fallback' && a !== throttleArg);
if (positional.length === 0) {
console.error('Usage: node check-liveness.mjs [--no-fallback] [--throttle[=ms]] <url1> [url2] ...');
console.error(' node check-liveness.mjs [--no-fallback] [--throttle[=ms]] --file urls.txt');
process.exit(1);
}
let urls;
if (positional[0] === '--file') {
const text = await readFile(positional[1], 'utf-8');
urls = text.split('\n').map(l => l.trim()).filter(l => l && !l.startsWith('#'));
} else {
urls = positional;
}
const notes = [
noFallback ? null : 'headed fallback on challenge',
throttleBaseMs ? `throttle ~${throttleBaseMs / 1000}-${(throttleBaseMs * 2) / 1000}s` : null,
].filter(Boolean);
console.log(`Checking ${urls.length} URL(s)...${notes.length ? ` (${notes.join(', ')})` : ''}\n`);
const browser = await chromium.launch({ headless: true });
const page = await newLivenessPage(browser);
const headed = noFallback ? null : createHeadedPageProvider(chromium);
const getHeadedPage = headed ? () => headed.get() : undefined;
let active = 0, expired = 0, uncertain = 0;
// Sequential — project rule: never Playwright in parallel
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
const { result, reason } = await checkUrlLivenessWithFallback(page, url, { getHeadedPage });
const icon = { active: '✅', expired: '❌', uncertain: '⚠️' }[result];
console.log(`${icon} ${result.padEnd(10)} ${url}`);
if (result !== 'active') console.log(` ${reason}`);
if (result === 'active') active++;
else if (result === 'expired') expired++;
else uncertain++;
const wait = i < urls.length - 1 ? jitteredDelayMs(throttleBaseMs) : 0;
if (wait) await sleep(wait);
}
if (headed) await headed.close();
await browser.close();
console.log(`\nResults: ${active} active ${expired} expired ${uncertain} uncertain`);
if (expired > 0 || uncertain > 0) process.exit(1);
}
main().catch(err => {
console.error('Fatal:', err.message);
process.exit(1);
});