Skip to content

Commit 789624c

Browse files
authored
Merge pull request github#19001 from github/repo-sync
repo sync
2 parents f2f12aa + 52b4f09 commit 789624c

File tree

4 files changed

+91
-50
lines changed

4 files changed

+91
-50
lines changed

script/search/parse-page-sections-into-records.js

Lines changed: 35 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ export default function parsePageSectionsIntoRecords(page) {
5959
// pages that yields some decent content to be searched on, because
6060
// when you view these pages in a browser, there's clearly text there.
6161
if ($root.length > 0) {
62-
body = getAllText($, $root)
62+
body = getAllText($root)
6363
}
6464

6565
if (!body && !intro) {
@@ -85,55 +85,42 @@ export default function parsePageSectionsIntoRecords(page) {
8585
}
8686
}
8787

88-
function getAllText($, $root) {
89-
let text = ''
90-
91-
// We need this so we can know if we processed, for example,
92-
// a <td> followed by a <p> because if that's the case, don't use
93-
// a ' ' to concatenate the texts together but a '\n' instead.
94-
// That means, given this input:
95-
//
96-
// <p>Bla</p><table><tr><td>Foo</td><td>Bar</td></table><p>Hi again</p>
97-
//
98-
// we can produce this outcome:
99-
//
100-
// 'Bla\nFoo Bar\nHi again'
101-
//
102-
let previousTagName = ''
103-
104-
$('p, h2, h3, td, pre, li', $root).each((i, element) => {
105-
const $element = $(element)
106-
if (previousTagName === 'td' && element.tagName !== 'td') {
107-
text += '\n'
108-
}
109-
// Because our cheerio selector is all the block level tags,
110-
// what you might end up with is, from:
111-
//
112-
// <li><p>Text</p></li>
113-
// <li><pre>Code</pre></li>
114-
//
115-
// ['Text', 'Text', 'Code', 'Code']
116-
//
117-
// because it will spot both the <li> and the <p>.
118-
// If all HTML was exactly like that, you could omit the <li> selector,
119-
// but a lot of HTML is like this:
120-
//
121-
// <li>Bare text<li>
122-
//
123-
// So we need to bail if we're inside a block level element whose parent
124-
// already was a <li>.
125-
if ((element.tagName === 'p' || element.tagName === 'pre') && element.parent.tagName === 'li') {
126-
return
88+
function getAllText($root) {
89+
const inlineElements = new Set(
90+
`a,abbr,acronym,audio,b,bdi,bdo,big,br,button,canvas,cite,code,data,
91+
datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,label,map,mark,
92+
meter,noscript,object,output,picture,progress,q,ruby,s,samp,script,
93+
select,slot,small,span,strong,sub,sup,svg,template,textarea,time,
94+
tt,u,var,video,wbr`
95+
.split(',')
96+
.map((s) => s.trim())
97+
)
98+
99+
const walkTree = (node, callback, index = 0, level = 0) => {
100+
callback(node, index, level)
101+
for (let i = 0; i < (node.children || []).length; i++) {
102+
walkTree(node.children[i], callback, i, ++level)
103+
level--
127104
}
128-
text += $element.text()
129-
if (element.tagName === 'td') {
130-
text += ' '
131-
} else {
132-
text += '\n'
105+
}
106+
107+
const fragments = []
108+
109+
walkTree($root[0], (element) => {
110+
if (element.name === 'body') return
111+
112+
if (element.type === 'text') {
113+
const parentElement = element.parent || {}
114+
const previousElement = element.prev || {}
115+
let { data } = element
116+
if (data.trim()) {
117+
if (!inlineElements.has(parentElement.name) && !inlineElements.has(previousElement.name)) {
118+
data = `\n${data}`
119+
}
120+
fragments.push(data)
121+
}
133122
}
134-
previousTagName = element.tagName
135123
})
136-
text = text.trim().replace(/\s*[\r\n]+/g, '\n')
137124

138-
return text
125+
return fragments.join('').trim()
139126
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<div data-search="breadcrumbs">
2+
<nav class="breadcrumbs">
3+
<a href="#">GitHub Actions</a>
4+
<a href="#">actions learning path</a>
5+
<a href="#">I am the page title</a>
6+
</nav>
7+
</div>
8+
9+
<h1>I am the page title</h1>
10+
11+
<div data-search="lead">
12+
<p>This is an introduction to the article.</p>
13+
</div>
14+
15+
<div data-search="article-body">
16+
<h1>Heading</h1>
17+
18+
<!-- Deliberately no whitespace between tags -->
19+
<div><ul><ul><li><div><span><div><a href="foo"><h2>Adding an email address to your GitHub account</h2><p>GitHub, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li>
20+
<li><div><div><a href="/"><h2>Changing your primary email address</h2><p>You can change the email address associated with your personal account at any time.</p></a></div></span></div></li>
21+
</ul></ul></div>
22+
23+
</div>

tests/unit/search/fixtures/page-with-multiple-h1s.html

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ <h1>I am the page title</h1>
1414

1515
<div data-search="article-body">
1616
<h1>A heading 1 inside the body</h1>
17-
<p>This won't be ignored.</p>
17+
18+
<div data-search="article-body" class="Box-sc-1gh2r6s-0 fWkkBJ"><div class="d-flex flex-items-baseline flex-justify-between"><h1 class="border-bottom-0">Managing email preferences</h1></div><div class="f2 color-fg-muted mb-3 Lead_container__g1kT8" data-search="lead">You can add or change the email addresses associated with your account on GitHub.com. You can also manage emails you receive from GitHub.</div><div class="border-bottom border-xl-0 pb-4 mb-5 pb-xl-2 mb-xl-2"></div><div class="mt-7"><ul data-testid="table-of-contents" class="list-style-none"><ul class="List__ListBox-sc-1x7olzq-0 iFaQQI"><li tabindex="0" aria-labelledby="react-aria-1029 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1029" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/adding-an-email-address-to-your-github-account"><h2 class="py-1 h4">Adding an email address to your GitHub account</h2><p class="f4 color-fg-muted">GitHub allows you to add as many email addresses to your account as you like. If you set an email address in your local Git configuration, you will need to add it to your account settings in order to connect your commits to your account. For more information about your email address and commits, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1032 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1032" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/changing-your-primary-email-address"><h2 class="py-1 h4">Changing your primary email address</h2><p class="f4 color-fg-muted">You can change the email address associated with your personal account at any time.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1035 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1035" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/setting-a-backup-email-address"><h2 class="py-1 h4">Setting a backup email address</h2><p class="f4 color-fg-muted">Use a backup email address as an additional destination for security-relevant account notifications and to securely reset your password if you can no longer access your primary email address.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1038 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1038" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/setting-your-commit-email-address"><h2 class="py-1 h4">Setting your commit email address</h2><p class="f4 color-fg-muted">You can set the email address that is used to author commits on GitHub.com and on your computer.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1041 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1041" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/blocking-command-line-pushes-that-expose-your-personal-email-address"><h2 class="py-1 h4">Blocking command line pushes that expose your personal email address</h2><p class="f4 color-fg-muted">If you've chosen to keep your email address private when performing web-based operations, you can also choose to block command line pushes that may expose your personal email address.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1044 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1044" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/remembering-your-github-username-or-email"><h2 class="py-1 h4">Remembering your GitHub username or email</h2><p class="f4 color-fg-muted">Are you signing in to GitHub.com for the first time in a while? If so, welcome back! If you can't remember the username for your personal account on GitHub, you can try these methods for remembering it.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1047 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1047" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/types-of-emails-github-sends"><h2 class="py-1 h4">Types of emails GitHub sends</h2><p class="f4 color-fg-muted">There are several types of emails you can receive from GitHub, including notifications, account information, customer research invitations, and marketing communications.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1050 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1050" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/managing-marketing-emails-from-github"><h2 class="py-1 h4">Managing marketing emails from GitHub</h2><p class="f4 color-fg-muted">In addition to notifications and account emails, GitHub occasionally sends marketing emails with news and information about our products. If you unsubscribe from existing marketing emails, you won't be included in future campaigns unless you change your GitHub email settings.</p></a></div></span></div></li></ul></ul></div></div>
1819
</div>

tests/unit/search/parse-page-sections-into-records.js

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import { fileURLToPath } from 'url'
22
import path from 'path'
33
import fs from 'fs/promises'
4+
45
import cheerio from 'cheerio'
6+
import { expect, test } from '@jest/globals'
7+
58
import parsePageSectionsIntoRecords from '../../../script/search/parse-page-sections-into-records.js'
69
const __dirname = path.dirname(fileURLToPath(import.meta.url))
710

@@ -22,6 +25,10 @@ const fixtures = {
2225
path.join(__dirname, 'fixtures/page-with-multiple-h1s.html'),
2326
'utf8'
2427
),
28+
pageHeadingParagraphNoWhitespace: await fs.readFile(
29+
path.join(__dirname, 'fixtures/page-with-heading-and-paragraph-no-whitespace.html'),
30+
'utf8'
31+
),
2532
}
2633

2734
describe('search parsePageSectionsIntoRecords module', () => {
@@ -40,7 +47,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
4047
"In this article\nThis won't be ignored.\nFirst heading\n" +
4148
"Here's a paragraph.\nAnd another.\nSecond heading\n" +
4249
"Here's a paragraph in the second section.\nAnd another.\n" +
43-
'Table heading\nPeter Human\n' +
50+
'Table heading\nPeter\nHuman\n' +
4451
'Bullet\nPoint\nNumbered\nList\n' +
4552
"Further reading\nThis won't be ignored.",
4653
topics: ['topic1', 'topic2', 'GitHub Actions', 'Actions'],
@@ -90,4 +97,27 @@ describe('search parsePageSectionsIntoRecords module', () => {
9097
const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
9198
expect(record.title).toEqual('I am the page title')
9299
})
100+
101+
test("content doesn't lump headings with paragraphs together", () => {
102+
const html = fixtures.pageHeadingParagraphNoWhitespace
103+
const $ = cheerio.load(html)
104+
const href = '/example/href'
105+
const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
106+
107+
// This is a <h2> inside the page but it should only appear once.
108+
// We had a bug where the heading would be injected twice.
109+
// E.g.
110+
//
111+
// <h2>Heading</h2><p>Text here</p>
112+
//
113+
// would become:
114+
//
115+
// Heading\nHeadingText here
116+
//
117+
// So now we make sure it only appears exactly once.
118+
expect(record.content.match(/Changing your primary email address/g).length).toBe(1)
119+
// But note also that it would also concatenate the text of the heading
120+
// with the text of the paragraph without a whitespace in between.
121+
expect(record.content.includes('email addressYou can set')).toBeFalsy()
122+
})
93123
})

0 commit comments

Comments
 (0)