Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 24 additions & 0 deletions packages/lib/services/ocr/OcrService.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -248,4 +248,28 @@ describe('OcrService', () => {
// await service.dispose();
// });

// Checks that OCR text is still produced for the `low_confidence_testing.png`
// sample: every numbered entry and its title must appear in the resource's
// `ocr_text` after processing.
it('should generate text even on cases of lower confidence', async () => {
	const { resource } = await createNoteAndResource({ path: `${ocrSampleDir}/low_confidence_testing.png` });

	const service = newOcrService();

	// Dispose in `finally` so the service is released even when one of the
	// assertions below fails; otherwise a failing expectation would skip the
	// cleanup entirely.
	try {
		await service.processResources();

		const processedResource: ResourceEntity = await Resource.load(resource.id);
		const ocrText = processedResource.ocr_text;

		// `toContain` is used instead of `includes(...).toBe(true)` so that a
		// failure prints the actual OCR output rather than just `false`.
		expect(ocrText).toContain('1.');
		// Only the tail of the first title is matched (hence the cSpell
		// suppression) - presumably the start of the word is not recognised
		// reliably; confirm against the sample image.
		// cSpell:disable
		expect(ocrText).toContain('eback Mountain (2005)');
		// cSpell:enable

		expect(ocrText).toContain('2.');
		expect(ocrText).toContain('Havoc (2005)');

		expect(ocrText).toContain('3.');
		expect(ocrText).toContain('Love & Other Drugs (2010)');

		expect(ocrText).toContain('4.');
		expect(ocrText).toContain('The Last Thing He Wanted (2020)');
	} finally {
		await service.dispose();
	}
});

});
13 changes: 9 additions & 4 deletions packages/lib/services/ocr/drivers/OcrDriverTesseract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,15 @@ const formatTesseractBoundingBox = (boundingBox: Tesseract.Bbox): RecognizeResul
return [boundingBox.x0, boundingBox.x1, boundingBox.y0, boundingBox.y1];
};

// Empirically, it seems anything below 70 is not usable. Between 70 and 75 it's
// hit and miss, but often it's good enough that we should keep the result.
// Above this is usually reliable.
const minConfidence = 70;
// History of the minimum confidence threshold:
//
// 2023-12-13: Empirically, it seems anything below 70 is not usable. Between 70
// and 75 it's hit and miss, but often it's good enough that we should keep the
// result. Above this is usually reliable. Using 70 for now.
//
// 2025-04-03: Lowered from 70 to 55 so that text is detected in images that
// other tools handle but that Joplin previously did not.
//
// https://github.com/laurent22/joplin/issues/11608
//
// NOTE(review): presumably compared against the confidence score Tesseract
// reports for each recognised item - the code that reads this constant is not
// visible in this excerpt, so confirm before relying on that.
const minConfidence = 55;

interface Options {
workerPath: string;
Expand Down