From c62f49aa0e2e416ec9c0de99a4fd6c26abc1404a Mon Sep 17 00:00:00 2001 From: Kevin Heis Date: Thu, 17 Dec 2020 11:57:18 -0800 Subject: [PATCH] Block indexing not crawling (#17044) * Block indexing instead of crawling * Lint * Update deprecated-enterprise-versions.js * Combine loops --- .../archived-enterprise-versions-assets.js | 2 +- middleware/archived-enterprise-versions.js | 2 +- middleware/block-robots.js | 40 ++++++ middleware/index.js | 1 + middleware/robots.js | 31 +---- tests/rendering/block-robots.js | 117 ++++++++++++++++++ tests/rendering/robots-txt.js | 94 -------------- .../routing/deprecated-enterprise-versions.js | 2 +- 8 files changed, 162 insertions(+), 127 deletions(-) create mode 100644 middleware/block-robots.js create mode 100644 tests/rendering/block-robots.js diff --git a/middleware/archived-enterprise-versions-assets.js b/middleware/archived-enterprise-versions-assets.js index 22362e5a831b..e34d5adc80c3 100644 --- a/middleware/archived-enterprise-versions-assets.js +++ b/middleware/archived-enterprise-versions-assets.js @@ -25,7 +25,7 @@ module.exports = async (req, res, next) => { res.set('content-type', r.headers['content-type']) res.set('content-length', r.headers['content-length']) res.set('x-is-archived', 'true') - res.set('x-robots-tag', 'none') + res.set('x-robots-tag', 'noindex') res.send(r.body) } catch (err) { next() diff --git a/middleware/archived-enterprise-versions.js b/middleware/archived-enterprise-versions.js index c2c9f6a7ad8b..1ede1b15049b 100644 --- a/middleware/archived-enterprise-versions.js +++ b/middleware/archived-enterprise-versions.js @@ -36,7 +36,7 @@ module.exports = async (req, res, next) => { try { const r = await got(getProxyPath(req.path, requestedVersion)) res.set('content-type', r.headers['content-type']) - res.set('x-robots-tag', 'none') + res.set('x-robots-tag', 'noindex') // make the stubbed redirect files added in >=2.18 return 301 instead of 200 const staticRedirect = r.body.match(patterns.staticRedirect) diff --git a/middleware/block-robots.js b/middleware/block-robots.js new file mode 100644 index 000000000000..7a437972ded9 --- /dev/null +++ b/middleware/block-robots.js @@ -0,0 +1,40 @@ +const languages = require('../lib/languages') +const products = require('../lib/all-products') +const { deprecated } = require('../lib/enterprise-server-releases.js') + +const pathRegExps = [ + // Disallow indexing of WIP localized content + ...Object.values(languages) + .filter(language => language.wip) + .map(language => new RegExp(`^/${language.code}(/.*)?$`, 'i')), + + // Disallow indexing of WIP products + ...Object.values(products) + .filter(product => product.wip || product.hidden) + .map(product => [ + new RegExp(`^/.*?${product.href}`, 'i'), + ...product.versions.map( + version => new RegExp(`^/.*?${version}/${product.id}`, 'i') + ) + ]), + + // Disallow indexing of deprecated enterprise versions + ...deprecated + .map(version => [ + new RegExp(`^/.*?/enterprise-server@${version}/.*?`, 'i'), + new RegExp(`^/.*?/enterprise/${version}/.*?`, 'i') + ]) +].flat() + +function blockIndex (path) { + return pathRegExps.some(pathRe => pathRe.test(path)) +} + +const middleware = (req, res, next) => { + if (blockIndex(req.path)) res.set('x-robots-tag', 'noindex') + return next() +} + +middleware.blockIndex = blockIndex + +module.exports = middleware diff --git a/middleware/index.js b/middleware/index.js index 7996e9f54739..ce02a3d347da 100644 --- a/middleware/index.js +++ b/middleware/index.js @@ -56,6 +56,7 @@ module.exports = function (app) { // *** Config and context for rendering *** app.use(require('./find-page')) // Must come before archived-enterprise-versions, breadcrumbs, featured-links, products, render-page + app.use(require('./block-robots')) // *** Rendering, 2xx responses *** // I largely ordered these by use frequency diff --git a/middleware/robots.js b/middleware/robots.js index d00986cc8560..967d1f21dde9 100644 --- a/middleware/robots.js +++ b/middleware/robots.js @@ -1,33 +1,4 @@ -const languages = require('../lib/languages') -const products = require('../lib/all-products') -const { deprecated } = require('../lib/enterprise-server-releases.js') - -let defaultResponse = 'User-agent: *' - -// Disallow crawling of WIP localized content -Object.values(languages) - .filter(language => language.wip) - .forEach(language => { - defaultResponse = defaultResponse.concat(`\nDisallow: /${language.code}\nDisallow: /${language.code}/*\n`) - }) - -// Disallow crawling of WIP products -Object.values(products) - .filter(product => product.wip || product.hidden) - .forEach(product => { - defaultResponse = defaultResponse.concat(`\nDisallow: /*${product.href}`) - product.versions.forEach(version => { - defaultResponse = defaultResponse.concat(`\nDisallow: /*${version}/${product.id}`) - }) - }) - -// Disallow crawling of Deprecated enterprise versions -deprecated - .forEach(version => { - defaultResponse = defaultResponse - .concat(`\nDisallow: /*/enterprise-server@${version}/*`) - .concat(`\nDisallow: /*/enterprise/${version}/*`) - }) +const defaultResponse = 'User-agent: *' const disallowAll = `User-agent: * Disallow: /` diff --git a/tests/rendering/block-robots.js b/tests/rendering/block-robots.js new file mode 100644 index 000000000000..5110e21e0790 --- /dev/null +++ b/tests/rendering/block-robots.js @@ -0,0 +1,117 @@ +const { blockIndex } = require('../../middleware/block-robots') +const languages = require('../../lib/languages') +const products = require('../../lib/all-products') +const enterpriseServerReleases = require('../../lib/enterprise-server-releases') + +function allowIndex (path) { + return !blockIndex(path) +} + +describe('block robots', () => { + it('allows crawling of the homepage and English content', async () => { + expect(allowIndex('/')).toBe(true) + expect(allowIndex('/en')).toBe(true) + expect(allowIndex('/en/articles/verifying-your-email-address')).toBe(true) + }) + + it('allows crawling of generally available localized content', async () => { + Object.values(languages) + .filter(language => !language.wip) + .forEach(language => { + expect(allowIndex(`/${language.code}`)).toBe(true) + expect(allowIndex(`/${language.code}/articles/verifying-your-email-address`)).toBe(true) + }) + }) + + it('disallows crawling of WIP localized content', async () => { + Object.values(languages) + .filter(language => language.wip) + .forEach(language => { + expect(allowIndex(`/${language.code}`)).toBe(false) + expect(allowIndex(`/${language.code}/articles/verifying-your-email-address`)).toBe(false) + }) + }) + + it('disallows crawling of WIP products', async () => { + const wipProductIds = Object.values(products) + .filter(product => product.wip) + .map(product => product.id) + + wipProductIds.forEach(id => { + const { href } = products[id] + const blockedPaths = [ + // English + `/en${href}`, + `/en${href}/overview`, + `/en${href}/overview/intro`, + `/en/enterprise/${enterpriseServerReleases.latest}/user${href}`, + `/en/enterprise/${enterpriseServerReleases.oldestSupported}/user${href}`, + + // Japanese + `/ja${href}`, + `/ja${href}/overview`, + `/ja${href}/overview/intro`, + `/ja/enterprise/${enterpriseServerReleases.latest}/user${href}`, + `/ja/enterprise/${enterpriseServerReleases.oldestSupported}/user${href}` + ] + + blockedPaths.forEach(path => { + expect(allowIndex(path)).toBe(false) + }) + }) + }) + + it('disallows crawling of early access "hidden" products', async () => { + const hiddenProductIds = Object.values(products) + .filter(product => product.hidden) + .map(product => product.id) + + hiddenProductIds.forEach(id => { + const { versions } = products[id] + const blockedPaths = versions.map(version => { + return [ + // English + `/en/${version}/${id}`, + `/en/${version}/${id}/some-early-access-article`, + // Japanese + `/ja/${version}/${id}`, + `/ja/${version}/${id}/some-early-access-article` + ] + }).flat() + + blockedPaths.forEach(path => { + expect(allowIndex(path)).toBe(false) + }) + }) + }) + + it('allows crawling of non-WIP products', async () => { + expect('actions' in products).toBe(true) + expect(allowIndex('/en/actions')).toBe(true) + expect(allowIndex('/en/actions/overview')).toBe(true) + expect(allowIndex('/en/actions/overview/intro')).toBe(true) + expect(allowIndex(`/en/enterprise/${enterpriseServerReleases.latest}/user/actions`)).toBe(true) + expect(allowIndex(`/en/enterprise/${enterpriseServerReleases.oldestSupported}/user/actions`)).toBe(true) + }) + + it('disallows crawling of deprecated enterprise releases', async () => { + enterpriseServerReleases.deprecated.forEach(version => { + const blockedPaths = [ + // English + `/en/enterprise-server@${version}/actions`, + `/en/enterprise/${version}/actions`, + `/en/enterprise-server@${version}/actions/overview`, + `/en/enterprise/${version}/actions/overview`, + // Japanese + `/ja/enterprise-server@${version}/actions`, + `/ja/enterprise/${version}/actions`, + `/ja/enterprise-server@${version}/actions/overview`, + `/ja/enterprise/${version}/actions/overview` + ] + + blockedPaths.forEach(path => { + expect(allowIndex(path)).toBe(false) + }) + }) + }) +}) diff --git a/tests/rendering/robots-txt.js b/tests/rendering/robots-txt.js index 6bd78542697b..2835baa908c1 100644 --- a/tests/rendering/robots-txt.js +++ b/tests/rendering/robots-txt.js @@ -3,8 +3,6 @@ const robotsParser = require('robots-parser') const robotsMiddleware = require('../../middleware/robots') const { get } = require('../helpers/supertest') const MockExpressResponse = require('mock-express-response') -const products = require('../../lib/all-products') -const enterpriseServerReleases = require('../../lib/enterprise-server-releases') describe('robots.txt', () => { jest.setTimeout(5 * 60 * 1000) @@ -31,15 +29,6 @@ describe('robots.txt', () => { }) }) - it('disallows indexing of WIP localized content', async () => { - Object.values(languages) - .filter(language => language.wip) - .forEach(language => { - expect(robots.isAllowed(`https://docs.github.com/${language.code}`)).toBe(false) - expect(robots.isAllowed(`https://docs.github.com/${language.code}/articles/verifying-your-email-address`)).toBe(false) - }) - }) - it('disallows indexing of herokuapp.com domains', async () => { const req = { hostname: 'docs-internal-12345--my-branch.herokuapp.com', @@ -52,89 +41,6 @@ describe('robots.txt', () => { expect(res._getString()).toEqual('User-agent: *\nDisallow: /') }) - it('disallows indexing of WIP products', async () => { - const wipProductIds = Object.values(products) - .filter(product => product.wip) - .map(product => product.id) - - wipProductIds.forEach(id => { - const { href } = products[id] - const blockedPaths = [ - // English - `https://docs.github.com/en${href}`, - `https://docs.github.com/en${href}/overview`, - `https://docs.github.com/en${href}/overview/intro`, - `https://docs.github.com/en/enterprise/${enterpriseServerReleases.latest}/user${href}`, - `https://docs.github.com/en/enterprise/${enterpriseServerReleases.oldestSupported}/user${href}`, - - // Japanese - `https://docs.github.com/ja${href}`, - `https://docs.github.com/ja${href}/overview`, - `https://docs.github.com/ja${href}/overview/intro`, - `https://docs.github.com/ja/enterprise/${enterpriseServerReleases.latest}/user${href}`, - `https://docs.github.com/ja/enterprise/${enterpriseServerReleases.oldestSupported}/user${href}` - ] - - blockedPaths.forEach(path => { - expect(robots.isAllowed(path)).toBe(false) - }) - }) - }) - - it('disallows indexing of early access "hidden" products', async () => { - const hiddenProductIds = Object.values(products) - .filter(product => product.hidden) - .map(product => product.id) - - hiddenProductIds.forEach(id => { - const { versions } = products[id] - const blockedPaths = versions.map(version => { - return [ - // English - `https://docs.github.com/en/${version}/${id}`, - `https://docs.github.com/en/${version}/${id}/some-early-access-article`, - // Japanese - `https://docs.github.com/ja/${version}/${id}`, - `https://docs.github.com/ja/${version}/${id}/some-early-access-article` - ] - }).flat() - - blockedPaths.forEach(path => { - expect(robots.isAllowed(path)).toBe(false) - }) - }) - }) - - it('allows indexing of non-WIP products', async () => { - expect('actions' in products).toBe(true) - expect(robots.isAllowed('https://docs.github.com/en/actions')).toBe(true) - expect(robots.isAllowed('https://docs.github.com/en/actions/overview')).toBe(true) - expect(robots.isAllowed('https://docs.github.com/en/actions/overview/intro')).toBe(true) - expect(robots.isAllowed(`https://docs.github.com/en/enterprise/${enterpriseServerReleases.latest}/user/actions`)).toBe(true) - expect(robots.isAllowed(`https://docs.github.com/en/enterprise/${enterpriseServerReleases.oldestSupported}/user/actions`)).toBe(true) - }) - - it('disallows indexing of deprecated enterprise releases', async () => { - enterpriseServerReleases.deprecated.forEach(version => { - const blockedPaths = [ - // English - `https://docs.github.com/en/enterprise-server@${version}/actions`, - `https://docs.github.com/en/enterprise/${version}/actions`, - `https://docs.github.com/en/enterprise-server@${version}/actions/overview`, - `https://docs.github.com/en/enterprise/${version}/actions/overview`, - // Japanese - `https://docs.github.com/ja/enterprise-server@${version}/actions`, - `https://docs.github.com/ja/enterprise/${version}/actions`, - `https://docs.github.com/ja/enterprise-server@${version}/actions/overview`, - `https://docs.github.com/ja/enterprise/${version}/actions/overview` - ] - - blockedPaths.forEach(path => { - expect(robots.isAllowed(path)).toBe(false) - }) - }) - }) - it('does not have duplicate lines', () => { const lines = new Set() for (const line of res.text.split('\n')) { diff --git a/tests/routing/deprecated-enterprise-versions.js b/tests/routing/deprecated-enterprise-versions.js index 9b5ea537d781..79985e90a648 100644 --- a/tests/routing/deprecated-enterprise-versions.js +++ b/tests/routing/deprecated-enterprise-versions.js @@ -40,7 +40,7 @@ describe('enterprise deprecation', () => { test('sets the expected x-robots-tag header for deprecated Enterprise pages', async () => { const res = await get('/en/enterprise/2.13/user/articles/about-branches') expect(res.statusCode).toBe(200) - expect(res.get('x-robots-tag')).toBe('none') + expect(res.get('x-robots-tag')).toBe('noindex') }) test('handles requests for deprecated Enterprise pages ( <2.13 )', async () => {