Skip to content
This repository was archived by the owner on Oct 30, 2024. It is now read-only.

Commit 5015b13

Browse files
authored
change: simpler way of handling metadata inserted by .knowledge.json (#156)
1 parent e4bd7fb commit 5015b13

File tree

2 files changed

+70
-54
lines changed

2 files changed

+70
-54
lines changed

pkg/client/common.go

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -94,12 +94,13 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID
9494
}
9595

9696
if fileInfo.IsDir() {
97-
initialMetadata := &Metadata{Metadata: map[string]FileMetadata{}}
98-
directoryMetadata, err := loadAndMergeMetadata(path, initialMetadata)
97+
directoryMetadata, err := loadDirMetadata(path)
9998
if err != nil {
10099
return ingestedFilesCount, err
101100
}
102-
metadataStack = append(metadataStack, *directoryMetadata)
101+
if directoryMetadata != nil {
102+
metadataStack = append(metadataStack, *directoryMetadata)
103+
}
103104

104105
// Process directory
105106
err = filepath.WalkDir(path, func(subPath string, d os.DirEntry, err error) error {
@@ -115,12 +116,13 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID
115116
}
116117

117118
// One dir level deeper -> load new metadata
118-
parentMetadata := metadataStack[len(metadataStack)-1]
119-
newMetadata, err := loadAndMergeMetadata(subPath, &parentMetadata)
119+
newMetadata, err := loadDirMetadata(subPath)
120120
if err != nil {
121121
return err
122122
}
123-
metadataStack = append(metadataStack, *newMetadata)
123+
if newMetadata != nil {
124+
metadataStack = append(metadataStack, *newMetadata)
125+
}
124126
return nil
125127
}
126128

@@ -141,16 +143,19 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID
141143
}
142144
touchedFilePaths = append(touchedFilePaths, absPath)
143145

144-
currentMetadata := metadataStack[len(metadataStack)-1]
145-
146146
g.Go(func() error {
147147
if err := sem.Acquire(ctx, 1); err != nil {
148148
return err
149149
}
150150
defer sem.Release(1)
151151

152-
slog.Debug("Ingesting file", "path", absPath, "metadata", currentMetadata)
153-
err = ingestionFunc(sp, currentMetadata.Metadata[filepath.Base(sp)]) // FIXME: metadata
152+
fileMeta, err := findMetadata(absPath, metadataStack)
153+
if err != nil {
154+
return fmt.Errorf("failed to find metadata for %s: %w", absPath, err)
155+
}
156+
slog.Debug("Ingesting file", "absPath", absPath, "metadata", fileMeta)
157+
158+
err = ingestionFunc(sp, fileMeta)
154159
if err == nil {
155160
ingestedFilesCount++
156161
}
@@ -161,8 +166,6 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID
161166
if err != nil {
162167
return ingestedFilesCount, err
163168
}
164-
// Directory processed, pop metadata
165-
metadataStack = metadataStack[:len(metadataStack)-1]
166169
} else {
167170
if isIgnored(ignore, path) {
168171
slog.Debug("Ignoring file", "path", path, "ignorefile", opts.IgnoreFile, "ignoreExtensions", opts.IgnoreExtensions)
@@ -181,16 +184,12 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID
181184
}
182185
defer sem.Release(1)
183186

184-
var fileMetadata FileMetadata
185-
if len(metadataStack) > 0 {
186-
currentMetadata := metadataStack[len(metadataStack)-1]
187-
fileMetadata = currentMetadata.Metadata[filepath.Base(path)]
188-
}
189-
err = ingestionFunc(path, fileMetadata)
190-
if err == nil {
191-
ingestedFilesCount++
187+
ingestedFilesCount++
188+
fileMeta, err := findMetadata(absPath, metadataStack)
189+
if err != nil {
190+
return fmt.Errorf("failed to find metadata for %s: %w", absPath, err)
192191
}
193-
return err
192+
return ingestionFunc(path, fileMeta)
194193
})
195194
}
196195

pkg/client/metadata.go

Lines changed: 50 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package client
33
import (
44
"encoding/json"
55
"fmt"
6+
"log/slog"
67
"os"
78
"path/filepath"
89
"strings"
@@ -11,51 +12,67 @@ import (
1112
const MetadataFilename = ".knowledge.json"
1213

1314
type Metadata struct {
14-
Metadata map[string]FileMetadata `json:"metadata"` // Map of file paths to metadata
15+
MetadataFileAbsPath string
16+
Metadata map[string]FileMetadata `json:"metadata"` // Map of file paths to metadata
1517
// TODO (idea): add other fields like description here, so we can hierarchically build a dataset description? Challenge is pruning and merging.
1618
}
1719

1820
type FileMetadata map[string]any
1921

20-
func loadAndMergeMetadata(dirPath string, parentMetadata *Metadata) (*Metadata, error) {
22+
// loadDirMetadata checks if the given directory contains a metadata file (MetadataFilename).
23+
// If so, it reads it in and returns it, with MetadataFileAbsPath set to the absolute path of that file.
24+
// If the metadata file cannot be stat'ed, it returns nil metadata and a nil error; nothing is merged with parent metadata here.
25+
func loadDirMetadata(dirPath string) (*Metadata, error) {
2126
metadataPath := filepath.Join(dirPath, MetadataFilename)
22-
dirName := filepath.Base(dirPath)
23-
if _, err := os.Stat(metadataPath); err == nil { // Metadata file exists
24-
fileContent, err := os.ReadFile(metadataPath)
25-
if err != nil {
26-
return nil, fmt.Errorf("failed to read metadata file %s: %w", metadataPath, err)
27-
}
27+
metaAbsPath, err := filepath.Abs(metadataPath)
28+
if err != nil {
29+
return nil, fmt.Errorf("failed to get absolute path for %s: %w", metadataPath, err)
30+
}
31+
dirPath = filepath.Dir(metadataPath)
32+
if _, err := os.Stat(metadataPath); err != nil {
33+
return nil, nil
34+
}
35+
// Metadata file exists
36+
fileContent, err := os.ReadFile(metadataPath)
37+
if err != nil {
38+
return nil, fmt.Errorf("failed to read metadata file %s: %w", metadataPath, err)
39+
}
2840

29-
var newMetadata Metadata
30-
if err := json.Unmarshal(fileContent, &newMetadata); err != nil {
31-
return nil, fmt.Errorf("failed to unmarshal metadata file %s: %w", metadataPath, err)
32-
}
41+
metadata := &Metadata{
42+
MetadataFileAbsPath: metaAbsPath,
43+
}
44+
if err := json.Unmarshal(fileContent, &metadata); err != nil {
45+
return nil, fmt.Errorf("failed to unmarshal metadata file %s: %w", metadataPath, err)
46+
}
3347

34-
// Merge with parent metadata, overriding existing keys
35-
mergedMetadata := &Metadata{Metadata: make(map[string]FileMetadata, len(parentMetadata.Metadata)+len(newMetadata.Metadata))}
36-
for filename, fileMetadata := range parentMetadata.Metadata {
37-
if !strings.HasPrefix(filename, dirName) {
38-
// skip entries which are not meant for this (sub-)directory
39-
continue
40-
}
41-
fname := strings.TrimPrefix(strings.TrimPrefix(filename, dirName), string(filepath.Separator))
42-
mergedMetadata.Metadata[fname] = fileMetadata
43-
}
48+
slog.Info("Loaded metadata", "path", metadataPath, "metadata", metadata.Metadata)
49+
50+
return metadata, nil
4451

45-
if newMetadata.Metadata != nil {
46-
for filename, fileMetadata := range newMetadata.Metadata {
47-
for k, v := range fileMetadata {
48-
if mergedMetadata.Metadata[filename] == nil {
49-
mergedMetadata.Metadata[filename] = make(FileMetadata, len(fileMetadata))
50-
}
51-
mergedMetadata.Metadata[filename][k] = v
52-
}
52+
}
53+
54+
func findMetadata(path string, metadataStack []Metadata) (FileMetadata, error) {
55+
56+
absPath, err := filepath.Abs(path)
57+
if err != nil {
58+
return nil, err
59+
}
60+
61+
metadata := make(map[string]any)
62+
63+
for _, metadataEntry := range metadataStack {
64+
target := strings.TrimPrefix(strings.TrimPrefix(absPath, filepath.Dir(metadataEntry.MetadataFileAbsPath)), string(filepath.Separator))
65+
66+
if m, ok := metadataEntry.Metadata[target]; ok {
67+
for k, v := range m {
68+
metadata[k] = v
5369
}
5470
}
5571

56-
return mergedMetadata, nil
5772
}
5873

59-
// No metadata file, return parent metadata as is
60-
return parentMetadata, nil
74+
slog.Debug("Found metadata", "path", path, "metadata", metadata)
75+
76+
return metadata, nil
77+
6178
}

0 commit comments

Comments
 (0)