Commit 568d4ce

feat(gatsby-source-drupal): Use the collection count from JSON:API extras to enable parallel API requests for cold builds (#32883)
* feat(gatsby-source-drupal): Use the collection count from JSON:API extras to construct URLs. Otherwise, we have to wait to start querying each page until the previous one finishes. This change lets us query all pages in parallel, so instead of fetching one collection page at a time, we can fetch up to the maximum concurrency (default 20). For a test site with ~3200 entities, this PR dropped sourcing time from ~14s to 4s.
* Use the new browser-based URL parser
* Comment the code more
* Use the page size the site has set instead of assuming 50
* Use the original type that's set, as that's always there
* Log updates while sourcing
* Encourage people to enable this setting in the README
* Update gatsby-node.js
1 parent 41f5337 · commit 568d4ce

2 files changed: +65 −1


packages/gatsby-source-drupal/README.md

Lines changed: 6 additions & 0 deletions

````diff
@@ -34,6 +34,12 @@ module.exports = {
   }
 }
 ```
 
+On the Drupal side, we highly recommend installing [JSON:API
+Extras](https://www.drupal.org/project/jsonapi_extras) and enabling "Include
+count in collection queries" at `/admin/config/services/jsonapi/extras`, as that
+[speeds up fetching data from Drupal by around
+4x](https://github.com/gatsbyjs/gatsby/pull/32883).
+
 ### Filters
 
 You can use the `filters` option to limit the data that is retrieved from Drupal. Filters are applied per JSON API collection. You can use any [valid JSON API filter query](https://www.drupal.org/docs/8/modules/jsonapi/filtering). For large data sets this can reduce the build time of your application by allowing Gatsby to skip content you'll never use.
````
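
For context, once "Include count in collection queries" is enabled, every JSON:API collection response carries a total count alongside the usual pagination links. Here is a minimal sketch of the relevant response fields as this commit reads them (`d.body.meta.count` and `d.body.links.next.href` in the diff below); the host, path, and values are illustrative:

```js
// Hypothetical JSON:API collection response with JSON:API Extras'
// "Include count in collection queries" enabled — values are illustrative.
const exampleCollectionResponse = {
  data: [
    // …up to page[limit] resource objects…
  ],
  meta: {
    // Total number of entities in the collection. This is what lets the
    // plugin compute every page URL up front instead of walking links.next.
    count: 3210,
  },
  links: {
    next: {
      href: `https://drupal.example.com/jsonapi/node/article?page%5Blimit%5D=50&page%5Boffset%5D=50`,
    },
  },
}
```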

packages/gatsby-source-drupal/src/gatsby-node.js

Lines changed: 59 additions & 1 deletion

```diff
@@ -22,7 +22,28 @@ const agent = {
   // http2: new http2wrapper.Agent(),
 }
 
+let start
+let apiRequestCount = 0
+let initialSourcing = true
+let globalReporter
 async function worker([url, options]) {
+  // Log out progress during the initial sourcing.
+  if (initialSourcing) {
+    apiRequestCount += 1
+    if (!start) {
+      start = Date.now()
+    }
+    const queueLength = requestQueue.length()
+    if (apiRequestCount % 50 === 0) {
+      globalReporter.verbose(
+        `gatsby-source-drupal has ${queueLength} API requests queued and the current request rate is ${(
+          apiRequestCount /
+          ((Date.now() - start) / 1000)
+        ).toFixed(2)} requests / second`
+      )
+    }
+  }
+
   return got(url, {
     agent,
     cache: false,
```
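
With verbose logging enabled (e.g. `gatsby build --verbose`), the block above prints a progress line after every 50th API request during the initial sourcing. The format comes straight from the template string; the numbers here are illustrative:

```text
gatsby-source-drupal has 143 API requests queued and the current request rate is 18.52 requests / second
```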
```diff
@@ -72,6 +93,7 @@ exports.sourceNodes = async (
   },
   pluginOptions
 ) => {
+  globalReporter = reporter
   const {
     baseUrl,
     apiBase = `jsonapi`,
```

```diff
@@ -293,6 +315,7 @@ exports.sourceNodes = async (
   drupalFetchActivity.start()
 
   let allData
+  const typeRequestsQueued = new Set()
   try {
     const res = await requestQueue.push([
       urlJoin(baseUrl, apiBase),
```

```diff
@@ -370,7 +393,39 @@ exports.sourceNodes = async (
       if (d.body.included) {
         dataArray.push(...d.body.included)
       }
-      if (d.body.links && d.body.links.next) {
+
+      // If JSON:API extras is configured to add the resource count, we can queue
+      // all API requests immediately instead of waiting for each request to return
+      // the next URL. This lets us request resources in parallel vs. sequentially,
+      // which is much faster.
+      if (d.body.meta?.count) {
+        // Only queue the page URLs once per type.
+        if (d.body.links.next?.href && !typeRequestsQueued.has(type)) {
+          typeRequestsQueued.add(type)
+
+          // Get the number of API requests needed.
+          // We round down, as we've already gotten the first page at this point.
+          const pageSize = new URL(d.body.links.next.href).searchParams.get(
+            `page[limit]`
+          )
+          const requestsCount = Math.floor(d.body.meta.count / pageSize)
+
+          reporter.verbose(
+            `queueing ${requestsCount} API requests for type ${type} which has ${d.body.meta.count} entities.`
+          )
+
+          const newUrl = new URL(d.body.links.next.href)
+          await Promise.all(
+            _.range(requestsCount).map(pageOffset => {
+              // We're starting one page ahead, as page 0 is already fetched.
+              pageOffset += 1
+              // Construct the URL for this page's offset.
+              newUrl.searchParams.set(`page[offset]`, pageOffset * pageSize)
+              return getNext(newUrl.toString())
+            })
+          )
+        }
+      } else if (d.body.links?.next) {
         await getNext(d.body.links.next)
       }
     }
```
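
To make the offset arithmetic concrete, here is a standalone sketch of the same page math (not plugin code; the entity count, host, and page size are made up). With 3,210 entities and a page size of 50, the first page is already fetched, so 64 more requests are queued at offsets 50 through 3,200:

```js
// Standalone illustration of the page math above — numbers are made up.
const count = 3210 // would come from d.body.meta.count
const next = new URL(
  `https://drupal.example.com/jsonapi/node/article?page%5Blimit%5D=50&page%5Boffset%5D=50`
)

// The page size the site has set, read from the `next` link.
// Note: searchParams.get returns a string; the arithmetic below coerces it.
const pageSize = next.searchParams.get(`page[limit]`) // "50"

// Round down because page 0 (offsets 0–49) came back with the first request.
const requestsCount = Math.floor(count / pageSize) // 64

// Offsets for the remaining pages: 50, 100, …, 3200.
const offsets = Array.from(
  { length: requestsCount },
  (_, i) => (i + 1) * pageSize
)

console.log(offsets[0], offsets[offsets.length - 1]) // 50 3200
```

If `count` happens to be an exact multiple of the page size, the last queued offset equals `count` and should come back as an empty page, so the over-fetch is at most one request.
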
```diff
@@ -480,6 +535,9 @@ exports.sourceNodes = async (
       createNode(node)
     }
 
+    // We're now done with the initial sourcing.
+    initialSourcing = false
+
     return
   }
 
```