Commit 573d2e7

How to work with S3 bucket

1 parent fe27599

6 files changed (+283, -2 lines)

README.md

Lines changed: 26 additions & 2 deletions

# MLFLOW

MLflow is an open-source platform to manage the complete ML lifecycle.

## Deployment

The [NCSA](https://github.com/ncsa/charts/tree/main/charts/mlflow) helm chart is used for this deployment.

```bash
helm upgrade mlflow ncsa/mlflow --values values.yaml -n mlflow
```

Don't forget to update the image version in the Dockerfile and in the Helm charts.

## Accessing the S3 bucket

1. Create `s3.ini`:

```ini
host_base = <url>
host_bucket = <url>
bucket_location = us-east-1
use_https = True

# Set up access keys
access_key = <key>
secret_key = <secret>

# Enable S3 v4 signature APIs
signature_v2 = false
```

2. Access the bucket with these commands:

```bash
# List all buckets
s3cmd -c s3.ini ls
# List the contents of the mlflow bucket
s3cmd -c s3.ini ls s3://mlflow
```
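
Objects can also be copied with s3cmd's `get` and `put` subcommands; the object and file names below are placeholders:

```bash
# Download an object from the bucket
s3cmd -c s3.ini get s3://mlflow/<object> <local-file>
# Upload a local file to the bucket
s3cmd -c s3.ini put <local-file> s3://mlflow/
```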

ceph-s3/config.yaml

Lines changed: 6 additions & 0 deletions

```yaml
aws_access_key_id:
aws_secret_access_key:
bucket_name:
object_name:
endpoint_url:
```
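
These fields are consumed by `ceph-s3/s3-demo.py`; as a minimal sketch, they map onto a boto3 client like this (all values come from your own config.yaml):

```python
import boto3
import yaml

# Load credentials and bucket settings from config.yaml
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Point the S3 client at the Ceph endpoint given in the config
s3 = boto3.client(
    "s3",
    aws_access_key_id=cfg["aws_access_key_id"],
    aws_secret_access_key=cfg["aws_secret_access_key"],
    endpoint_url=cfg["endpoint_url"],
)
```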

ceph-s3/mlflow3-example.py

Lines changed: 137 additions & 0 deletions

```python
import pandas as pd
import torch
import torch.nn as nn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.pytorch
from mlflow.entities import Dataset

mlflow.set_tracking_uri("https://ard-mlflow.slac.stanford.edu")


# Helper function to prepare data
def prepare_data(df):
    X = torch.tensor(df.iloc[:, :-1].values, dtype=torch.float32)
    y = torch.tensor(df.iloc[:, -1].values, dtype=torch.long)
    return X, y


# Helper function to compute accuracy
def compute_accuracy(model, X, y):
    with torch.no_grad():
        outputs = model(X)
        _, predicted = torch.max(outputs, 1)
        accuracy = (predicted == y).sum().item() / y.size(0)
    return accuracy


# Define a basic PyTorch classifier
class IrisClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


# Load Iris dataset and prepare the DataFrame
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df["target"] = iris.target

# Split into training and testing datasets
train_df, test_df = train_test_split(iris_df, test_size=0.2, random_state=42)

# Prepare training data (sizes only; the MLflow dataset is created inside the run)
X_train, y_train = prepare_data(train_df)

# Define the PyTorch model and move it to the device
input_size = X_train.shape[1]
hidden_size = 16
output_size = len(iris.target_names)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scripted_model = IrisClassifier(input_size, hidden_size, output_size).to(device)
scripted_model = torch.jit.script(scripted_model)

# Start a run to represent the training job
with mlflow.start_run() as run:
    # Load the training dataset with MLflow. We will link training metrics to this dataset.
    train_dataset: Dataset = mlflow.data.from_pandas(train_df, name="train")
    X_train, y_train = prepare_data(train_dataset.df)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(scripted_model.parameters(), lr=0.01)

    for epoch in range(101):
        X_train = X_train.to(device)
        y_train = y_train.to(device)
        out = scripted_model(X_train)
        loss = criterion(out, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log a checkpoint with metrics every 10 epochs
        if epoch % 10 == 0:
            # Each newly created LoggedModel checkpoint is linked with its name and step
            model_info = mlflow.pytorch.log_model(
                pytorch_model=scripted_model,
                name=f"torch-iris-{epoch}",
                step=epoch,
                input_example=X_train.cpu().numpy(),  # move to CPU first in case we trained on GPU
            )
            # Log params to the run; the LoggedModel inherits those params
            mlflow.log_params(
                params={
                    "n_layers": 3,
                    "activation": "ReLU",
                    "criterion": "CrossEntropyLoss",
                    "optimizer": "Adam",
                }
            )
            # Log metric on training dataset at this step and link it to the LoggedModel
            mlflow.log_metric(
                key="accuracy",
                value=compute_accuracy(scripted_model, X_train, y_train),
                step=epoch,
                model_id=model_info.model_id,
                dataset=train_dataset,
            )

# Rank all checkpoints logged by this run, best accuracy first
ranked_checkpoints = mlflow.search_logged_models(
    filter_string=f"source_run_id='{run.info.run_id}'",
    order_by=[{"field_name": "metrics.accuracy", "ascending": False}],
    output_format="list",
)

best_checkpoint = ranked_checkpoints[0]
print(f"Best model: {best_checkpoint}")
print(best_checkpoint.metrics)

# Best model: <LoggedModel: artifact_location='file:///Users/serena.ruan/Documents/repos/mlflow-3-doc/mlruns/0/models/41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf/artifacts', creation_timestamp=1743734069924, experiment_id='0', last_updated_timestamp=1743734075018, metrics=[<Metric: dataset_digest='1f1c13b5', dataset_name='train', key='accuracy', model_id='41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf', run_id='12f143a7fda1461e9240d7ffad4ea5bd', step=100, timestamp=1743734075029, value=0.975>], model_id='41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf', model_type='', model_uri='models:/41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf', name='torch-iris-100', params={'activation': 'ReLU',
#   'criterion': 'CrossEntropyLoss',
#   'n_layers': '3',
#   'optimizer': 'Adam'}, source_run_id='12f143a7fda1461e9240d7ffad4ea5bd', status=<LoggedModelStatus.READY: 'READY'>, status_message='', tags={'mlflow.source.git.commit': '7324c807f07a1766d4b951733e3d723504b4576e',
#   'mlflow.source.name': 'a.py',
#   'mlflow.source.type': 'LOCAL',
#   'mlflow.user': 'serena.ruan'}>
# [<Metric: dataset_digest='1f1c13b5', dataset_name='train', key='accuracy', model_id='41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf', run_id='12f143a7fda1461e9240d7ffad4ea5bd', step=100, timestamp=1743734075029, value=0.975>]

worst_checkpoint = ranked_checkpoints[-1]
print(f"Worst model: {worst_checkpoint}")
print(worst_checkpoint.metrics)

# Worst model: <LoggedModel: artifact_location='file:///Users/serena.ruan/Documents/repos/mlflow-3-doc/mlruns/0/models/0d789084-9a3b-4b85-9d43-6a148c014b7e/artifacts', creation_timestamp=1743734016290, experiment_id='0', last_updated_timestamp=1743734022728, metrics=[<Metric: dataset_digest='1f1c13b5', dataset_name='train', key='accuracy', model_id='0d789084-9a3b-4b85-9d43-6a148c014b7e', run_id='12f143a7fda1461e9240d7ffad4ea5bd', step=0, timestamp=1743734022737, value=0.3>], model_id='0d789084-9a3b-4b85-9d43-6a148c014b7e', model_type='', model_uri='models:/0d789084-9a3b-4b85-9d43-6a148c014b7e', name='torch-iris-0', params={}, source_run_id='12f143a7fda1461e9240d7ffad4ea5bd', status=<LoggedModelStatus.READY: 'READY'>, status_message='', tags={'mlflow.source.git.commit': '7324c807f07a1766d4b951733e3d723504b4576e',
#   'mlflow.source.name': 'a.py',
#   'mlflow.source.type': 'LOCAL',
#   'mlflow.user': 'serena.ruan'}>
# [<Metric: dataset_digest='1f1c13b5', dataset_name='train', key='accuracy', model_id='0d789084-9a3b-4b85-9d43-6a148c014b7e', run_id='12f143a7fda1461e9240d7ffad4ea5bd', step=0, timestamp=1743734022737, value=0.3>]
```
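
As a follow-up, one way to reload the top-ranked checkpoint is through its model URI. This is a minimal sketch that continues from the script above and assumes the run completed and the tracking server is reachable:

```python
# Reload the highest-accuracy checkpoint by its model URI (e.g. "models:/<model_id>")
best_model = mlflow.pyfunc.load_model(best_checkpoint.model_uri)

# Score a few held-out rows; pass float32 features to match what was logged
X_test_np = test_df.iloc[:, :-1].to_numpy(dtype="float32")
print(best_model.predict(X_test_np[:5]))
```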

ceph-s3/s3-demo.py

Lines changed: 81 additions & 0 deletions

```python
import boto3
import pandas as pd
import yaml
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read credentials and bucket settings from config.yaml
with open("config.yaml") as f:
    config = yaml.safe_load(f)

aws_access_key_id = config["aws_access_key_id"]
aws_secret_access_key = config["aws_secret_access_key"]
bucket_name = config["bucket_name"]
object_name = config["object_name"]
local_file = "wine-quality.csv"
endpoint_url = config.get("endpoint_url")

s3_client = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name="us-east-1",  # optional, depending on the endpoint
    endpoint_url=endpoint_url,
)

print(f"Downloading {object_name} from bucket {bucket_name}...")
s3_client.download_file(bucket_name, object_name, local_file)

df = pd.read_csv(local_file)
print("Dataset shape:", df.shape)
print(df.head())

X = df.drop("quality", axis=1)
y = df["quality"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mlflow.set_tracking_uri("https://ard-mlflow.slac.stanford.edu")
mlflow.set_experiment("S3 Boto3 Demo")
mlflow.sklearn.autolog()

with mlflow.start_run() as run:
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlflow.log_metric("accuracy", acc)

    input_example = pd.DataFrame(X_train.iloc[:1])

    mlflow.sklearn.log_model(
        sk_model=model,
        name="model",
        registered_model_name="wine_quality_model",
        input_example=input_example,
    )

print("Run ID:", run.info.run_id)
print("Accuracy logged:", acc)
print("Model registered as: wine_quality_model")

# Load the model back from the registry (this version number must exist on the server)
model_uri = "models:/wine_quality_model/7"
loaded_model = mlflow.pyfunc.load_model(model_uri)
# Alternatively, load directly from a local artifact path:
# model_uri = "mlartifacts/0/models/m-ce23c6a1b14b41a091e6c0b549122d0c/artifacts"
# loaded_model = mlflow.pyfunc.load_model(model_uri)
run_id = loaded_model.metadata.run_id
print(run_id)

sample = X_test.iloc[:5]
preds_loaded = loaded_model.predict(sample)
print("Sample predictions from registry:", preds_loaded)

df_compare = pd.DataFrame({
    "Actual": y_test.iloc[:5].values,
    "Predicted": preds_loaded,
})
print("\nComparison of actual vs predictions:")
print(df_compare)
```
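
Note that the hard-coded registry version (`7`) only works while that version exists. A hedged alternative, assuming the same tracking server, is to resolve the newest version at runtime:

```python
from mlflow import MlflowClient

# Resolve the newest registered version instead of pinning "7"
client = MlflowClient()
versions = client.search_model_versions("name='wine_quality_model'")
latest = max(int(v.version) for v in versions)
loaded_model = mlflow.pyfunc.load_model(f"models:/wine_quality_model/{latest}")
```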

ceph-s3/upload.py

Lines changed: 22 additions & 0 deletions

```python
import boto3

aws_access_key_id = ""
aws_secret_access_key = ""
bucket_name = "mlflow-backup"
file_path = "wine-quality.csv"  # Local file to upload
object_name = "wine-quality.csv"  # How it will be named in S3

# If you have a custom S3 endpoint, pass endpoint_url="https://s3.yourprovider.com"
s3_client = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name="",  # fill in if your endpoint requires it
    endpoint_url="",  # fill in your S3 endpoint before running
)

# Upload the file
s3_client.upload_file(file_path, bucket_name, object_name)

print(f"Uploaded {file_path} to s3://{bucket_name}/{object_name}")
```
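
A quick sanity check after the upload can confirm the object landed; this sketch reuses the client and names defined above:

```python
# Confirm the uploaded object exists and report its size
resp = s3_client.head_object(Bucket=bucket_name, Key=object_name)
print("Size on S3:", resp["ContentLength"], "bytes")
```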

s3.ini

Lines changed: 11 additions & 0 deletions

```ini
host_base =
host_bucket =
bucket_location = us-east-1
use_https = True

# Set up access keys
access_key =
secret_key =

# Enable S3 v4 signature APIs
signature_v2 = false
```
