Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit cac1011

Browse files
authored
Update workflow to get the archived, deprecated and malicious data from S3 (#1034)
Updates the import_packages workflow to get the archived, deprecated and malicious data from S3 instead of from the codegate-data repo.
1 parent 3e82eba commit cac1011

File tree

1 file changed

+32
-31
lines changed

1 file changed

+32
-31
lines changed

.github/workflows/import_packages.yml

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,16 @@ name: Sync vector DB
33

44
on:
55
workflow_dispatch:
6-
inputs:
7-
enable_artifact_download:
8-
description: 'Enable artifact download step'
9-
type: boolean
10-
required: false
11-
default: true
6+
127
jobs:
13-
# This workflow contains a single job called "greet"
148
sync_db:
159
# The type of runner that the job will run on
1610
runs-on: ubuntu-latest
11+
permissions:
12+
contents: read
13+
id-token: write
14+
env:
15+
AWS_REGION: us-east-1
1716

1817
# Steps represent a sequence of tasks that will be executed as part of the job
1918
steps:
@@ -31,32 +30,34 @@ jobs:
3130
git lfs install
3231
git lfs pull
3332
34-
- name: Download json data
35-
id: download-json-data
36-
uses: dawidd6/action-download-artifact@20319c5641d495c8a52e688b7dc5fada6c3a9fbc # v8
33+
- name: Configure AWS Credentials for S3
34+
uses: aws-actions/configure-aws-credentials@49f33fe638c0cba4fb16037a27915a7ab7740259
3735
with:
38-
repo: stacklok/codegate-data
39-
workflow: ".github/workflows/generate-artifact.yml"
40-
workflow_conclusion: success
41-
name: jsonl-files
42-
path: /tmp/
43-
name_is_regexp: true
44-
skip_unpack: false
45-
if_no_artifact_found: ignore
36+
role-to-assume: ${{ secrets.AWS_ROLE_INSIGHT_DATA_IMPORT }}
37+
aws-region: ${{ env.AWS_REGION }}
4638

47-
- name: Download artifact
48-
if: ${{ github.event.inputs.enable_artifact_download == 'true' }}
49-
id: download-artifact
50-
uses: dawidd6/action-download-artifact@20319c5641d495c8a52e688b7dc5fada6c3a9fbc # v8
51-
with:
52-
github_token: ${{ github.token }}
53-
workflow: ".github/workflows/import_packages.yml"
54-
workflow_conclusion: success
55-
name: sqlite_data
56-
path: /tmp/
57-
name_is_regexp: true
58-
skip_unpack: false
59-
if_no_artifact_found: ignore
39+
- name: Download JSONL files from S3
40+
run: |
41+
echo "Downloading manifest.json from S3..."
42+
aws s3 cp s3://codegate-data-prod/manifest.json ./manifest.json --region $AWS_REGION
43+
echo "Manifest content:"
44+
cat manifest.json
45+
46+
echo "Parsing manifest..."
47+
MALICIOUS_KEY=$(jq -r '.latest.malicious_packages' manifest.json)
48+
DEPRECATED_KEY=$(jq -r '.latest.deprecated_packages' manifest.json)
49+
ARCHIVED_KEY=$(jq -r '.latest.archived_packages' manifest.json)
50+
51+
echo "Malicious key: $MALICIOUS_KEY"
52+
echo "Deprecated key: $DEPRECATED_KEY"
53+
echo "Archived key: $ARCHIVED_KEY"
54+
55+
mkdir -p /tmp/jsonl-files
56+
57+
# Download and map the S3 files to fixed names in /tmp/jsonl-files
58+
aws s3 cp s3://codegate-data-prod/$MALICIOUS_KEY /tmp/jsonl-files/malicious.jsonl --region $AWS_REGION
59+
aws s3 cp s3://codegate-data-prod/$DEPRECATED_KEY /tmp/jsonl-files/deprecated.jsonl --region $AWS_REGION
60+
aws s3 cp s3://codegate-data-prod/$ARCHIVED_KEY /tmp/jsonl-files/archived.jsonl --region $AWS_REGION
6061
6162
- name: Install Poetry
6263
run: |

0 commit comments

Comments
 (0)