Skip to content

Commit 7531c0f

Browse files
authored
shot-scraper multi --har, --har-zip, --har-file options
* shot-scraper multi --har, --har-zip, --har-file * Ability to run shot-scraper multi --har and not take shots at all Closes #166
1 parent 1e8039f commit 7531c0f

File tree

4 files changed

+135
-6
lines changed

4 files changed

+135
-6
lines changed

docs/har.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
(har)=
12
# Saving a web page to an HTTP Archive
23

34
An HTTP Archive file captures the full details of a series of HTTP requests and responses as JSON.
@@ -51,6 +52,8 @@ Archive: datasette-io.har.zip
5152
154803 14 files
5253
```
5354

55+
You can record multiple pages to a single HTTP Archive using the {ref}`shot-scraper multi --har option<multi-har>`.
56+
5457
## `shot-scraper har --help`
5558

5659
Full `--help` for this command:

docs/multi.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,39 @@ You can include desired `height`, `width`, `quality`, `wait` and `wait_for` opti
109109
wait_for: document.querySelector('#bighead')
110110
```
111111

112+
(multi-har)=
113+
## Recording to an HTTP Archive
114+
115+
Similar to the {ref}`shot-scraper har command<har>`, `shot-scraper multi` can optionally record HTTP Archive files of the requests made during a session.
116+
117+
Add the `--har` flag to record all requests and responses to a `trace.har` JSON file, or `--har-zip` for a `trace.har.zip` file. Use `--har-file filename.har` to provide a path to a custom filename - this will be recorded as JSON or zip depending on the file extension.
118+
119+
For example:
120+
121+
```bash
122+
shot-scraper multi shots.yml --har
123+
```
124+
This will output something like this:
125+
```
126+
Screenshot of 'http://www.example.com/' written to 'example.com.png'
127+
Screenshot of 'https://www.w3.org/' written to 'w3c.org.png'
128+
Wrote to HAR file: trace.har
129+
```
130+
When writing to a HAR you can omit the `output:` key for an entry in the YAML file to skip taking a screenshot of that page. This `shots.yml` file, for example:
131+
```yaml
132+
- url: https://example.com/
133+
- url: https://datasette.io/
134+
```
135+
When run like this:
136+
```bash
137+
shot-scraper multi shots.yml --har-zip
138+
```
139+
Will produce this output, recording a HAR without taking any screenshots:
140+
```
141+
Skipping screenshot of 'https://example.com/'
142+
Skipping screenshot of 'https://datasette.io/'
143+
Wrote to HAR file: trace.har.zip
144+
```
112145
## Running a server for the duration of the session
113146

114147
If you need to run a server for the duration of the `shot-scraper multi` session you can specify that using a `server:` block, like this:
@@ -213,6 +246,9 @@ Options:
213246
--auth-password TEXT Password for HTTP Basic authentication
214247
--auth-username TEXT Username for HTTP Basic authentication
215248
--leave-server Leave servers running when script finishes
249+
--har Save all requests to trace.har file
250+
--har-zip Save all requests to trace.har.zip file
251+
--har-file FILE Path to HAR file to save all requests
216252
--help Show this message and exit.
217253
```
218254
<!-- [[[end]]] -->

shot_scraper/cli.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,21 @@ def _browser_context(
486486
is_flag=True,
487487
help="Leave servers running when script finishes",
488488
)
489+
@click.option(
490+
"--har",
491+
is_flag=True,
492+
help="Save all requests to trace.har file",
493+
)
494+
@click.option(
495+
"--har-zip",
496+
is_flag=True,
497+
help="Save all requests to trace.har.zip file",
498+
)
499+
@click.option(
500+
"--har-file",
501+
type=click.Path(file_okay=True, writable=True, dir_okay=False),
502+
help="Path to HAR file to save all requests",
503+
)
489504
def multi(
490505
config,
491506
auth,
@@ -506,6 +521,9 @@ def multi(
506521
auth_username,
507522
auth_password,
508523
leave_server,
524+
har,
525+
har_zip,
526+
har_file,
509527
):
510528
"""
511529
Take multiple screenshots, defined by a YAML file
@@ -524,8 +542,20 @@ def multi(
524542
For full YAML syntax documentation, see:
525543
https://shot-scraper.datasette.io/en/stable/multi.html
526544
"""
545+
if (har or har_zip) and not har_file:
546+
har_file = filename_for_url(
547+
"trace", ext="har.zip" if har_zip else "har", file_exists=os.path.exists
548+
)
549+
527550
scale_factor = normalize_scale_factor(retina, scale_factor)
528551
shots = yaml.safe_load(config)
552+
553+
# Special case: if we are recording a har_file output can be blank to skip a shot
554+
if har_file:
555+
for shot in shots:
556+
if not shot.get("output"):
557+
shot["skip_shot"] = True
558+
529559
server_processes = []
530560
if shots is None:
531561
shots = []
@@ -543,6 +573,7 @@ def multi(
543573
reduced_motion=reduced_motion,
544574
auth_username=auth_username,
545575
auth_password=auth_password,
576+
record_har_path=har_file or None,
546577
)
547578
try:
548579
for shot in shots:
@@ -599,11 +630,16 @@ def multi(
599630
browser_obj.close()
600631
if leave_server:
601632
for process, details in server_processes:
602-
print("Leaving server PID:", process.pid, " details:", details)
633+
click.echo(
634+
f"Leaving server PID: {process.pid} details: {details}",
635+
err=True,
636+
)
603637
else:
604638
if server_processes:
605639
for process, _ in server_processes:
606640
process.kill()
641+
if har_file and not silent:
642+
click.echo(f"Wrote to HAR file: {har_file}", err=True)
607643

608644

609645
@cli.command()
@@ -1341,12 +1377,16 @@ def on_response(response):
13411377
", ".join(list(selectors) + list(selectors_all)), url, output
13421378
)
13431379
else:
1344-
# Whole page
1345-
if return_bytes:
1346-
return page.screenshot(**screenshot_args)
1380+
if shot.get("skip_shot"):
1381+
message = "Skipping screenshot of '{}'".format(url)
13471382
else:
1348-
page.screenshot(**screenshot_args)
1349-
message = "Screenshot of '{}' written to '{}'".format(url, output)
1383+
# Whole page
1384+
if return_bytes:
1385+
return page.screenshot(**screenshot_args)
1386+
else:
1387+
page.screenshot(**screenshot_args)
1388+
message = "Screenshot of '{}' written to '{}'".format(url, output)
1389+
13501390
if not silent:
13511391
click.echo(message, err=True)
13521392

tests/test_shot_scraper.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,3 +259,53 @@ def test_har(http_server, args, expect_zip):
259259
# Verify entries is a non-empty list
260260
assert isinstance(har_content["log"]["entries"], list)
261261
assert len(har_content["log"]["entries"]) > 0
262+
263+
264+
@pytest.mark.parametrize(
265+
"args,expect_zip,record_shots",
266+
(
267+
(["--har"], False, True),
268+
(["--har-zip"], True, True),
269+
(["--har-file", "output.har"], False, True),
270+
(["--har-file", "output.har.zip"], True, True),
271+
# And one where we don't record the shots:
272+
(["--har"], False, False),
273+
),
274+
)
275+
def test_multi_har(http_server, args, expect_zip, record_shots):
276+
runner = CliRunner()
277+
(http_server.base_dir / "two.html").write_text("<h1>Two</h1>")
278+
with runner.isolated_filesystem():
279+
pathlib.Path("shots.yml").write_text(
280+
f"- url: {http_server.base_url}/\n"
281+
+ (f" output: index.png\n" if record_shots else "")
282+
+ f"- url: {http_server.base_url}/two.html\n"
283+
+ (f" output: two.png\n" if record_shots else "")
284+
)
285+
# Should be no files
286+
here = pathlib.Path(".")
287+
files = [str(p) for p in here.glob("*.*")]
288+
assert files == ["shots.yml"]
289+
result = runner.invoke(cli, ["multi", "shots.yml"] + args)
290+
assert result.exit_code == 0
291+
if record_shots:
292+
assert result.output.startswith("Screenshot of 'http://localhost")
293+
else:
294+
assert result.output.startswith("Skipping screenshot of 'http://localhost")
295+
assert "Wrote to HAR file:" in result.output
296+
assert (".har.zip" in result.output) == expect_zip
297+
# HAR file should have been created
298+
if expect_zip:
299+
files = here.glob("*.har.zip")
300+
else:
301+
files = here.glob("*.har")
302+
har_files = list(files)
303+
# Should have created exactly one .har file
304+
assert len(har_files) == 1
305+
assert bool(zipfile.is_zipfile(har_files[0])) == expect_zip
306+
shot_files = list(here.glob("*.png"))
307+
num_shots = len(shot_files)
308+
if record_shots:
309+
assert num_shots == 2
310+
else:
311+
assert num_shots == 0

0 commit comments

Comments
 (0)