Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 35 additions & 48 deletions script/search/parse-page-sections-into-records.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ export default function parsePageSectionsIntoRecords(page) {
// pages that yields some decent content to be searched on, because
// when you view these pages in a browser, there's clearly text there.
if ($root.length > 0) {
body = getAllText($, $root)
body = getAllText($root)
}

if (!body && !intro) {
Expand All @@ -85,55 +85,42 @@ export default function parsePageSectionsIntoRecords(page) {
}
}

function getAllText($, $root) {
let text = ''

// We need this so we can know if we processed, for example,
// a <td> followed by a <p> because if that's the case, don't use
// a ' ' to concatenate the texts together but a '\n' instead.
// That means, given this input:
//
// <p>Bla</p><table><tr><td>Foo</td><td>Bar</td></tr></table><p>Hi again</p>
//
// we can produce this outcome:
//
// 'Bla\nFoo Bar\nHi again'
//
let previousTagName = ''

$('p, h2, h3, td, pre, li', $root).each((i, element) => {
const $element = $(element)
if (previousTagName === 'td' && element.tagName !== 'td') {
text += '\n'
}
// Because our cheerio selector is all the block level tags,
// what you might end up with is, from:
//
// <li><p>Text</p></li>
// <li><pre>Code</pre></li>
//
// ['Text', 'Text', 'Code', 'Code']
//
// because it will spot both the <li> and the <p>.
// If all HTML was exactly like that, you could omit the <li> selector,
// but a lot of HTML is like this:
//
// <li>Bare text</li>
//
// So we need to bail if we're inside a block level element whose parent
// already was a <li>.
if ((element.tagName === 'p' || element.tagName === 'pre') && element.parent.tagName === 'li') {
return
/**
 * Collect the text found beneath `$root[0]` into a single string.
 *
 * Every text node with non-whitespace content is gathered in the order the
 * tree walk visits it. A text node gets a leading '\n' unless its parent or
 * its previous sibling is one of the inline elements listed below, so that
 * text coming from separate block-level elements ends up on separate lines.
 *
 * NOTE(review): this listing appears to interleave lines from the previous
 * implementation (the ones using `text`, `$element` and `previousTagName`)
 * with the new one -- confirm against the actual file.
 *
 * @param $root - presumably a cheerio selection; only its first element
 *   (`$root[0]`) is walked -- TODO confirm with the caller.
 * @returns {string} the collected fragments joined together and trimmed.
 */
function getAllText($root) {
// Tag names treated as inline. The list is written as a multi-line template
// literal, hence the trim() on every entry after splitting on commas.
const inlineElements = new Set(
`a,abbr,acronym,audio,b,bdi,bdo,big,br,button,canvas,cite,code,data,
datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,label,map,mark,
meter,noscript,object,output,picture,progress,q,ruby,s,samp,script,
select,slot,small,span,strong,sub,sup,svg,template,textarea,time,
tt,u,var,video,wbr`
.split(',')
.map((s) => s.trim())
)

// Depth-first walk: invoke `callback` on `node` first, then recurse into each
// of its children (if it has any), passing the child's index and a depth one
// greater than the current node's.
const walkTree = (node, callback, index = 0, level = 0) => {
callback(node, index, level)
for (let i = 0; i < (node.children || []).length; i++) {
// `++level` followed by `level--` passes level + 1 to the child while
// leaving `level` unchanged for the next sibling.
walkTree(node.children[i], callback, i, ++level)
level--
}
text += $element.text()
if (element.tagName === 'td') {
text += ' '
} else {
text += '\n'
}

const fragments = []

walkTree($root[0], (element) => {
// Returning here only skips the callback for this node; walkTree still
// recurses into its children.
if (element.name === 'body') return

if (element.type === 'text') {
// Fall back to {} so that `.name` below is simply undefined when the text
// node has no parent or no previous sibling.
const parentElement = element.parent || {}
const previousElement = element.prev || {}
let { data } = element
// Whitespace-only text nodes are ignored.
if (data.trim()) {
// Start a new line unless this text sits inside, or directly after, an
// inline element.
if (!inlineElements.has(parentElement.name) && !inlineElements.has(previousElement.name)) {
data = `\n${data}`
}
fragments.push(data)
}
}
previousTagName = element.tagName
})
text = text.trim().replace(/\s*[\r\n]+/g, '\n')

return text
return fragments.join('').trim()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<div data-search="breadcrumbs">
<nav class="breadcrumbs">
<a href="#">GitHub Actions</a>
<a href="#">actions learning path</a>
<a href="#">I am the page title</a>
</nav>
</div>

<h1>I am the page title</h1>

<div data-search="lead">
<p>This is an introduction to the article.</p>
</div>

<div data-search="article-body">
<h1>Heading</h1>

<!-- Deliberately no whitespace between tags -->
<div><ul><ul><li><div><span><div><a href="foo"><h2>Adding an email address to your GitHub account</h2><p>GitHub, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li>
<li><div><span><div><a href="/"><h2>Changing your primary email address</h2><p>You can change the email address associated with your personal account at any time.</p></a></div></span></div></li>
</ul></ul></div>

</div>
3 changes: 2 additions & 1 deletion tests/unit/search/fixtures/page-with-multiple-h1s.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ <h1>I am the page title</h1>

<div data-search="article-body">
<h1>A heading 1 inside the body</h1>
<p>This won't be ignored.</p>

<div data-search="article-body" class="Box-sc-1gh2r6s-0 fWkkBJ"><div class="d-flex flex-items-baseline flex-justify-between"><h1 class="border-bottom-0">Managing email preferences</h1></div><div class="f2 color-fg-muted mb-3 Lead_container__g1kT8" data-search="lead">You can add or change the email addresses associated with your account on GitHub.com. You can also manage emails you receive from GitHub.</div><div class="border-bottom border-xl-0 pb-4 mb-5 pb-xl-2 mb-xl-2"></div><div class="mt-7"><ul data-testid="table-of-contents" class="list-style-none"><ul class="List__ListBox-sc-1x7olzq-0 iFaQQI"><li tabindex="0" aria-labelledby="react-aria-1029 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1029" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/adding-an-email-address-to-your-github-account"><h2 class="py-1 h4">Adding an email address to your GitHub account</h2><p class="f4 color-fg-muted">GitHub allows you to add as many email addresses to your account as you like. If you set an email address in your local Git configuration, you will need to add it to your account settings in order to connect your commits to your account. 
For more information about your email address and commits, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1032 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1032" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/changing-your-primary-email-address"><h2 class="py-1 h4">Changing your primary email address</h2><p class="f4 color-fg-muted">You can change the email address associated with your personal account at any time.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1035 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1035" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/setting-a-backup-email-address"><h2 class="py-1 h4">Setting a backup email address</h2><p class="f4 color-fg-muted">Use a backup email address as an additional destination for security-relevant account notifications and to securely reset your password if you can no longer access your primary email address.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1038 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1038" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a 
rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/setting-your-commit-email-address"><h2 class="py-1 h4">Setting your commit email address</h2><p class="f4 color-fg-muted">You can set the email address that is used to author commits on GitHub.com and on your computer.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1041 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1041" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/blocking-command-line-pushes-that-expose-your-personal-email-address"><h2 class="py-1 h4">Blocking command line pushes that expose your personal email address</h2><p class="f4 color-fg-muted">If you've chosen to keep your email address private when performing web-based operations, you can also choose to block command line pushes that may expose your personal email address.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1044 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1044" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/remembering-your-github-username-or-email"><h2 class="py-1 h4">Remembering your GitHub username or email</h2><p class="f4 color-fg-muted">Are you signing in to GitHub.com for the first 
time in a while? If so, welcome back! If you can't remember the username for your personal account on GitHub, you can try these methods for remembering it.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1047 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1047" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/types-of-emails-github-sends"><h2 class="py-1 h4">Types of emails GitHub sends</h2><p class="f4 color-fg-muted">There are several types of emails you can receive from GitHub, including notifications, account information, customer research invitations, and marketing communications.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1050 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1050" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/managing-marketing-emails-from-github"><h2 class="py-1 h4">Managing marketing emails from GitHub</h2><p class="f4 color-fg-muted">In addition to notifications and account emails, GitHub occasionally sends marketing emails with news and information about our products. If you unsubscribe from existing marketing emails, you won't be included in future campaigns unless you change your GitHub email settings.</p></a></div></span></div></li></ul></ul></div></div>
</div>
32 changes: 31 additions & 1 deletion tests/unit/search/parse-page-sections-into-records.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import { fileURLToPath } from 'url'
import path from 'path'
import fs from 'fs/promises'

import cheerio from 'cheerio'
import { expect, test } from '@jest/globals'

import parsePageSectionsIntoRecords from '../../../script/search/parse-page-sections-into-records.js'
const __dirname = path.dirname(fileURLToPath(import.meta.url))

Expand All @@ -22,6 +25,10 @@ const fixtures = {
path.join(__dirname, 'fixtures/page-with-multiple-h1s.html'),
'utf8'
),
pageHeadingParagraphNoWhitespace: await fs.readFile(
path.join(__dirname, 'fixtures/page-with-heading-and-paragraph-no-whitespace.html'),
'utf8'
),
}

describe('search parsePageSectionsIntoRecords module', () => {
Expand All @@ -40,7 +47,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
"In this article\nThis won't be ignored.\nFirst heading\n" +
"Here's a paragraph.\nAnd another.\nSecond heading\n" +
"Here's a paragraph in the second section.\nAnd another.\n" +
'Table heading\nPeter Human\n' +
'Table heading\nPeter\nHuman\n' +
'Bullet\nPoint\nNumbered\nList\n' +
"Further reading\nThis won't be ignored.",
topics: ['topic1', 'topic2', 'GitHub Actions', 'Actions'],
Expand Down Expand Up @@ -90,4 +97,27 @@ describe('search parsePageSectionsIntoRecords module', () => {
const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
expect(record.title).toEqual('I am the page title')
})

test("content doesn't lump headings with paragraphs together", () => {
  const html = fixtures.pageHeadingParagraphNoWhitespace
  const $ = cheerio.load(html)
  const href = '/example/href'
  const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })

  // This is a <h2> inside the page but it should only appear once.
  // We had a bug where the heading would be injected twice.
  // E.g.
  //
  //    <h2>Heading</h2><p>Text here</p>
  //
  // would become:
  //
  //    Heading\nHeadingText here
  //
  // So now we make sure it appears exactly once. The `|| []` makes a
  // missing heading fail this assertion instead of throwing a TypeError
  // on `null.length`.
  const headingMatches = record.content.match(/Changing your primary email address/g) || []
  expect(headingMatches.length).toBe(1)

  // The same bug also concatenated the heading text with the text of the
  // paragraph that follows it, without any whitespace in between.
  // In this fixture the heading "Changing your primary email address" is
  // immediately followed by a paragraph starting with "You can change", so
  // that exact run-together string is what must not appear.
  expect(record.content).not.toContain('email addressYou can change')
})
})