@@ -62,14 +62,15 @@ def generate_s3_file_path(
62
62
user_name : str ,
63
63
repo_name : str ,
64
64
commit : str ,
65
+ subpath : str ,
65
66
include_patterns : set [str ] | None ,
66
67
ignore_patterns : set [str ],
67
68
) -> str :
68
69
"""Generate S3 file path with proper naming convention.
69
70
70
71
The file path is formatted as:
71
72
[<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/
72
- <exclude&include hash>/<owner>-<repo-name>.txt
73
+ <exclude&include hash>/<owner>-<repo-name>-<subpath-hash>.txt
73
74
74
75
If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path.
75
76
The commit-ID is always included in the URL.
@@ -85,6 +86,8 @@ def generate_s3_file_path(
85
86
Repository name.
86
87
commit : str
87
88
Commit hash.
89
+ subpath : str
90
+ Subpath within the repository; a truncated SHA-256 hash of it is appended to the file name.
88
91
include_patterns : set[str] | None
89
92
Set of patterns specifying which files to include.
90
93
ignore_patterns : set[str]
@@ -111,9 +114,10 @@ def generate_s3_file_path(
111
114
patterns_str = f"include:{ sorted (include_patterns ) if include_patterns else []} "
112
115
patterns_str += f"exclude:{ sorted (ignore_patterns )} "
113
116
patterns_hash = hashlib .sha256 (patterns_str .encode ()).hexdigest ()[:16 ]
117
+ subpath_hash = hashlib .sha256 (subpath .encode ()).hexdigest ()[:16 ]
114
118
115
- # Build the base path using hostname directly
116
- base_path = f"ingest/{ hostname } /{ user_name } /{ repo_name } /{ commit } /{ patterns_hash } /{ user_name } - { repo_name } .txt "
119
+ file_name = f" { user_name } - { repo_name } - { subpath_hash } .txt"
120
+ base_path = f"ingest/{ hostname } /{ user_name } /{ repo_name } /{ commit } /{ patterns_hash } /{ file_name } "
117
121
118
122
# Check for S3_DIRECTORY_PREFIX environment variable
119
123
s3_directory_prefix = os .getenv ("S3_DIRECTORY_PREFIX" )
0 commit comments