Skip to content

Commit 1e52267

Browse files
cpcloud and claude committed
feat(extract): persist extraction model and operations JSON (closes #764)
Store the name of the LLM model that produced the extraction, together with the raw operations JSON, alongside the document. The model column is visible in the documents table; the ops blob is stored for future inspection (see #766).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 68a58d7 commit 1e52267

File tree

9 files changed

+125
-26
lines changed

internal/app/coldefs.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ var documentColumnDefs = []columnDef{
305305
{"Entity", columnSpec{Title: "Entity", Min: 10, Max: 24, Flex: true, Kind: cellEntity}},
306306
{"Type", columnSpec{Title: "Type", Min: 8, Max: 16}},
307307
{"Size", columnSpec{Title: "Size", Min: 6, Max: 10, Align: alignRight, Kind: cellReadonly}},
308+
{"Model", columnSpec{Title: "Model", Min: 8, Max: 20, Kind: cellReadonly}},
308309
{"Notes", columnSpec{Title: "Notes", Min: 12, Max: 40, Flex: true, Kind: cellNotes}},
309310
{"Updated", columnSpec{Title: "Updated", Min: 10, Max: 12, Kind: cellReadonly}},
310311
}

internal/app/columns_generated.go

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/app/extraction.go

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,8 @@ func (m *Model) acceptDeferredExtraction() error {
885885
if len(ex.pendingData) > 0 {
886886
doc.ExtractData = ex.pendingData
887887
}
888+
doc.ExtractionModel = m.extractionModelUsed(ex)
889+
doc.ExtractionOps = marshalOps(ex.operations)
888890

889891
if err := m.store.CreateDocument(doc); err != nil {
890892
return fmt.Errorf("create document: %w", err)
@@ -909,11 +911,13 @@ func (m *Model) acceptDeferredExtraction() error {
909911
func (m *Model) acceptExistingExtraction() error {
910912
ex := m.ex.extraction
911913

912-
// Persist async extraction results.
913-
if ex.pendingText != "" || len(ex.pendingData) > 0 {
914+
// Persist async extraction results and the model that produced them.
915+
if ex.pendingText != "" || len(ex.pendingData) > 0 || ex.hasLLM {
914916
if m.store != nil {
917+
model := m.extractionModelUsed(ex)
918+
ops := marshalOps(ex.operations)
915919
if err := m.store.UpdateDocumentExtraction(
916-
ex.DocID, ex.pendingText, ex.pendingData,
920+
ex.DocID, ex.pendingText, ex.pendingData, model, ops,
917921
); err != nil {
918922
return fmt.Errorf("save extraction: %w", err)
919923
}
@@ -1965,6 +1969,28 @@ func stepName(si extractionStep) string {
19651969
return "?"
19661970
}
19671971

1972+
// marshalOps serializes extraction operations to JSON for persistence.
1973+
// Returns nil when there are no operations.
1974+
func marshalOps(ops []extract.Operation) []byte {
1975+
if len(ops) == 0 {
1976+
return nil
1977+
}
1978+
b, err := json.Marshal(ops)
1979+
if err != nil {
1980+
return nil
1981+
}
1982+
return b
1983+
}
1984+
1985+
// extractionModelUsed returns the model name if the LLM step completed
1986+
// successfully, or empty string if LLM was skipped or failed.
1987+
func (m *Model) extractionModelUsed(ex *extractionLogState) string {
1988+
if ex.hasLLM && ex.Steps[stepLLM].Status == stepDone {
1989+
return m.extractionModelLabel()
1990+
}
1991+
return ""
1992+
}
1993+
19681994
// extractionModelLabel returns the model name used for extraction.
19691995
func (m *Model) extractionModelLabel() string {
19701996
if m.ex.extractionModel != "" {
@@ -2015,9 +2041,9 @@ func previewColumns(tableName string, cur locale.Currency) []previewColDef {
20152041
case tableDocuments:
20162042
s := documentColumnSpecs()
20172043
return []previewColDef{
2018-
{data.ColTitle, s[1], fmtAnyText},
2019-
{data.ColMIMEType, s[3], fmtAnyText},
2020-
{data.ColNotes, s[5], fmtAnyText},
2044+
{data.ColTitle, s[documentColTitle], fmtAnyText},
2045+
{data.ColMIMEType, s[documentColType], fmtAnyText},
2046+
{data.ColNotes, s[documentColNotes], fmtAnyText},
20212047
}
20222048
case data.TableQuotes:
20232049
s := quoteColumnSpecs()

internal/app/extraction_test.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3134,6 +3134,63 @@ func TestExtractionTSVToggle_StatusMessage(t *testing.T) {
31343134
assert.Contains(t, m.status.Text, "layout off")
31353135
}
31363136

3137+
// ---------------------------------------------------------------------------
3138+
// extractionModelUsed tests
3139+
// ---------------------------------------------------------------------------
3140+
3141+
func TestExtractionModelUsed_ReturnsModelWhenLLMSucceeded(t *testing.T) {
3142+
t.Parallel()
3143+
m := newExtractionModel(t, map[extractionStep]stepStatus{
3144+
stepText: stepDone,
3145+
stepExtract: stepDone,
3146+
stepLLM: stepDone,
3147+
})
3148+
m.ex.extractionModel = "test-extraction-model"
3149+
3150+
result := m.extractionModelUsed(m.ex.extraction)
3151+
assert.Equal(t, "test-extraction-model", result)
3152+
}
3153+
3154+
func TestExtractionModelUsed_FallsBackToChatModel(t *testing.T) {
3155+
t.Parallel()
3156+
m := newExtractionModel(t, map[extractionStep]stepStatus{
3157+
stepText: stepDone,
3158+
stepExtract: stepDone,
3159+
stepLLM: stepDone,
3160+
})
3161+
m.ex.extractionModel = ""
3162+
m.llmClient = testExtractionOllamaClient(t, "chat-model")
3163+
3164+
result := m.extractionModelUsed(m.ex.extraction)
3165+
assert.Equal(t, "chat-model", result)
3166+
}
3167+
3168+
func TestExtractionModelUsed_EmptyWhenLLMSkipped(t *testing.T) {
3169+
t.Parallel()
3170+
m := newExtractionModel(t, map[extractionStep]stepStatus{
3171+
stepText: stepDone,
3172+
stepExtract: stepDone,
3173+
})
3174+
ex := m.ex.extraction
3175+
ex.hasLLM = false
3176+
3177+
result := m.extractionModelUsed(ex)
3178+
assert.Equal(t, "", result)
3179+
}
3180+
3181+
func TestExtractionModelUsed_EmptyWhenLLMFailed(t *testing.T) {
3182+
t.Parallel()
3183+
m := newExtractionModel(t, map[extractionStep]stepStatus{
3184+
stepText: stepDone,
3185+
stepExtract: stepDone,
3186+
stepLLM: stepFailed,
3187+
})
3188+
m.ex.extractionModel = "test-model"
3189+
3190+
result := m.extractionModelUsed(m.ex.extraction)
3191+
assert.Equal(t, "", result)
3192+
}
3193+
31373194
func TestExtractionTSVToggle_HintShownInFooter(t *testing.T) {
31383195
t.Parallel()
31393196
m := newExtractionModel(t, map[extractionStep]stepStatus{

internal/app/tables.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,7 @@ func documentRows(docs []data.Document, names entityNameMap) ([]table.Row, []row
662662
},
663663
{Value: d.MIMEType, Kind: cellText},
664664
{Value: formatFileSize(docSizeBytes(d)), Kind: cellReadonly},
665+
{Value: d.ExtractionModel, Kind: cellReadonly},
665666
{Value: d.Notes, Kind: cellNotes},
666667
{Value: d.UpdatedAt.Format(data.DateLayout), Kind: cellReadonly},
667668
},
@@ -679,6 +680,7 @@ func entityDocumentRows(docs []data.Document) ([]table.Row, []rowMeta, [][]cell)
679680
{Value: d.Title, Kind: cellText},
680681
{Value: d.MIMEType, Kind: cellText},
681682
{Value: formatFileSize(docSizeBytes(d)), Kind: cellReadonly},
683+
{Value: d.ExtractionModel, Kind: cellReadonly},
682684
{Value: d.Notes, Kind: cellNotes},
683685
{Value: d.UpdatedAt.Format(data.DateLayout), Kind: cellReadonly},
684686
},

internal/data/fts_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ func TestSearchDocumentsUpdateReflected(t *testing.T) {
158158
id := docs[0].ID
159159

160160
// Update extraction text.
161-
require.NoError(t, store.UpdateDocumentExtraction(id, "new text about plumbing", nil))
161+
require.NoError(t, store.UpdateDocumentExtraction(id, "new text about plumbing", nil, "", nil))
162162

163163
results, err := store.SearchDocuments("plumbing")
164164
require.NoError(t, err)

internal/data/meta_generated.go

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/data/models.go

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -271,21 +271,23 @@ type ServiceLogEntry struct {
271271
}
272272

273273
type Document struct {
274-
ID uint `gorm:"primaryKey"`
275-
Title string
276-
FileName string `gorm:"column:file_name"`
277-
EntityKind string `gorm:"index:idx_doc_entity"`
278-
EntityID uint `gorm:"index:idx_doc_entity"`
279-
MIMEType string ` extract:"-"`
280-
SizeBytes int64 ` extract:"-"`
281-
ChecksumSHA256 string `gorm:"column:sha256" extract:"-"`
282-
Data []byte
283-
ExtractedText string ` extract:"-"`
284-
ExtractData []byte `gorm:"column:ocr_data"`
285-
Notes string
286-
CreatedAt time.Time
287-
UpdatedAt time.Time
288-
DeletedAt gorm.DeletedAt `gorm:"index"`
274+
ID uint `gorm:"primaryKey"`
275+
Title string
276+
FileName string `gorm:"column:file_name"`
277+
EntityKind string `gorm:"index:idx_doc_entity"`
278+
EntityID uint `gorm:"index:idx_doc_entity"`
279+
MIMEType string ` extract:"-"`
280+
SizeBytes int64 ` extract:"-"`
281+
ChecksumSHA256 string `gorm:"column:sha256" extract:"-"`
282+
Data []byte
283+
ExtractedText string ` extract:"-"`
284+
ExtractData []byte `gorm:"column:ocr_data"`
285+
ExtractionModel string ` extract:"-"`
286+
ExtractionOps []byte `gorm:"column:extraction_ops" extract:"-"`
287+
Notes string
288+
CreatedAt time.Time
289+
UpdatedAt time.Time
290+
DeletedAt gorm.DeletedAt `gorm:"index"`
289291
}
290292

291293
type DeletionRecord struct {

internal/data/store.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,8 +1106,8 @@ func (s *Store) CountIncidentsByVendor(vendorIDs []uint) (map[uint]int, error) {
11061106
// avoid loading the potentially large Data BLOB.
11071107
var listDocumentColumns = []string{
11081108
ColID, ColTitle, ColFileName, ColEntityKind, ColEntityID,
1109-
ColMIMEType, ColSizeBytes, ColChecksumSHA256, ColNotes,
1110-
ColCreatedAt, ColUpdatedAt, ColDeletedAt,
1109+
ColMIMEType, ColSizeBytes, ColChecksumSHA256, ColExtractionModel,
1110+
ColNotes, ColCreatedAt, ColUpdatedAt, ColDeletedAt,
11111111
}
11121112

11131113
func (s *Store) ListDocuments(includeDeleted bool) ([]Document, error) {
@@ -1216,9 +1216,17 @@ func (s *Store) UpdateDocument(doc Document) error {
12161216
// UpdateDocumentExtraction persists async extraction results on a document
12171217
// without touching other fields. Called from the extraction overlay after
12181218
// async extraction completes.
1219-
func (s *Store) UpdateDocumentExtraction(id uint, text string, data []byte) error {
1219+
func (s *Store) UpdateDocumentExtraction(
1220+
id uint,
1221+
text string,
1222+
data []byte,
1223+
model string,
1224+
ops []byte,
1225+
) error {
12201226
updates := map[string]any{
1221-
ColExtractData: data,
1227+
ColExtractData: data,
1228+
ColExtractionModel: model,
1229+
ColExtractionOps: ops,
12221230
}
12231231
if text != "" {
12241232
updates[ColExtractedText] = text

0 commit comments

Comments
 (0)