How to work with S3 bucket

bhardwaj-gopika · bhardwaj-gopika · commit 573d2e7e2e9a · 2025-09-30T16:18:19.000-07:00
diff --git a/README.md b/README.md
@@ -1,12 +1,36 @@
 # MLFLOW
 
-MLflow is an open-source platform to manage the complete ML lifecycle. 
+MLflow is an open-source platform to manage the complete ML lifecycle.
 
 ## Deployment
-The [NCSA](https://github.com/ncsa/charts/tree/main/charts/mlflow) helm chart is used for this deployment. 
+The [NCSA](https://github.com/ncsa/charts/tree/main/charts/mlflow) helm chart is used for this deployment.
 
 ```bash
 helm upgrade  mlflow ncsa/mlflow --values values.yaml -n mlflow
 ```
 
 Don't forget to update the image version in Dockerfile and Helm charts.
+
+## Accessing S3 bucket
+
+1. Create s3.ini
+```ini
+host_base = <url>
+host_bucket = <url>
+bucket_location = us-east-1
+use_https = True
+
+# Setup access keys
+access_key = <key>
+secret_key = <secret>
+
+# Enable S3 v4 signature APIs
+signature_v2 = false
+``` 
+
+2. Access the bucket with these commands 
+```bash
+s3cmd -c s3.ini
+s3cmd -c s3.ini ls
+s3cmd -c s3.ini ls s3://mlflow
+```
diff --git a/ceph-s3/config.yaml b/ceph-s3/config.yaml
@@ -0,0 +1,6 @@
+aws_access_key_id: 
+aws_secret_access_key: 
+bucket_name: 
+object_name: 
+endpoint_url: 
+
diff --git a/ceph-s3/mlflow3-example.py b/ceph-s3/mlflow3-example.py
@@ -0,0 +1,137 @@
+import pandas as pd
+import torch
+import torch.nn as nn
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+
+import mlflow
+import mlflow.pytorch
+from mlflow.entities import Dataset
+
+mlflow.set_tracking_uri("https://ard-mlflow.slac.stanford.edu")
+
+# Helper function to prepare data
+def prepare_data(df):
+    X = torch.tensor(df.iloc[:, :-1].values, dtype=torch.float32)
+    y = torch.tensor(df.iloc[:, -1].values, dtype=torch.long)
+    return X, y
+
+
+# Helper function to compute accuracy
+def compute_accuracy(model, X, y):
+    with torch.no_grad():
+        outputs = model(X)
+        _, predicted = torch.max(outputs, 1)
+        accuracy = (predicted == y).sum().item() / y.size(0)
+    return accuracy
+
+
+# Define a basic PyTorch classifier
+class IrisClassifier(nn.Module):
+    def __init__(self, input_size, hidden_size, output_size):
+        super().__init__()
+        self.fc1 = nn.Linear(input_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.fc2 = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        return x
+
+
+# Load the Iris dataset into a DataFrame with the class label as the last column
+iris = load_iris()
+iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
+iris_df["target"] = iris.target  # appended last; prepare_data treats the last column as the label
+
+# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
+train_df, test_df = train_test_split(iris_df, test_size=0.2, random_state=42)
+
+# Prepare training data up front so the feature count can size the model below
+train_dataset = mlflow.data.from_pandas(train_df, name="train")
+X_train, y_train = prepare_data(train_dataset.df)
+
+# Define the PyTorch model, move it to the device, then compile it with TorchScript
+input_size = X_train.shape[1]
+hidden_size = 16
+output_size = len(iris.target_names)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # prefer the GPU when one is available
+scripted_model = IrisClassifier(input_size, hidden_size, output_size).to(device)
+scripted_model = torch.jit.script(scripted_model)
+
+# Start a run to represent the training job
+with mlflow.start_run() as run:
+    # Load the training dataset with MLflow. We will link training metrics to this dataset.
+    train_dataset: Dataset = mlflow.data.from_pandas(train_df, name="train")
+    X_train, y_train = prepare_data(train_dataset.df)
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(scripted_model.parameters(), lr=0.01)
+
+    for epoch in range(101):
+        X_train = X_train.to(device)
+        y_train = y_train.to(device)
+        out = scripted_model(X_train)
+        loss = criterion(out, y_train)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # Log a checkpoint with metrics every 10 epochs
+        if epoch % 10 == 0:
+            # Each newly created LoggedModel checkpoint is linked with its name and step
+            model_info = mlflow.pytorch.log_model(
+                pytorch_model=scripted_model,
+                name=f"torch-iris-{epoch}",
+                step=epoch,
+                input_example=X_train.numpy(),
+            )
+            # log params to the run, LoggedModel inherits those params
+            mlflow.log_params(
+                params={
+                    "n_layers": 3,
+                    "activation": "ReLU",
+                    "criterion": "CrossEntropyLoss",
+                    "optimizer": "Adam",
+                }
+            )
+            # Log metric on training dataset at step and link to LoggedModel
+            mlflow.log_metric(
+                key="accuracy",
+                value=compute_accuracy(scripted_model, X_train, y_train),
+                step=epoch,
+                model_id=model_info.model_id,
+                dataset=train_dataset,
+            )
+
+ranked_checkpoints = mlflow.search_logged_models(
+    filter_string=f"source_run_id='{run.info.run_id}'",  # only checkpoints logged by the run above
+    order_by=[{"field_name": "metrics.accuracy", "ascending": False}],  # highest accuracy first
+    output_format="list",
+)
+
+best_checkpoint = ranked_checkpoints[0]  # head of the descending-accuracy list
+print(f"Best model: {best_checkpoint}")
+print(best_checkpoint.metrics)
+
+# Best model: <LoggedModel: artifact_location='file:///Users/serena.ruan/Documents/repos/mlflow-3-doc/mlruns/0/models/41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf/artifacts', creation_timestamp=1743734069924, experiment_id='0', last_updated_timestamp=1743734075018, metrics=[<Metric: dataset_digest='1f1c13b5', dataset_name='train', key='accuracy', model_id='41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf', run_id='12f143a7fda1461e9240d7ffad4ea5bd', step=100, timestamp=1743734075029, value=0.975>], model_id='41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf', model_type='', model_uri='models:/41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf', name='torch-iris-100', params={'activation': 'ReLU',
+#  'criterion': 'CrossEntropyLoss',
+#  'n_layers': '3',
+#  'optimizer': 'Adam'}, source_run_id='12f143a7fda1461e9240d7ffad4ea5bd', status=<LoggedModelStatus.READY: 'READY'>, status_message='', tags={'mlflow.source.git.commit': '7324c807f07a1766d4b951733e3d723504b4576e',
+#  'mlflow.source.name': 'a.py',
+#  'mlflow.source.type': 'LOCAL',
+#  'mlflow.user': 'serena.ruan'}>
+# [<Metric: dataset_digest='1f1c13b5', dataset_name='train', key='accuracy', model_id='41bd5a16-25a6-447b-90e0-0f7b7e5cb6cf', run_id='12f143a7fda1461e9240d7ffad4ea5bd', step=100, timestamp=1743734075029, value=0.975>]
+
+worst_checkpoint = ranked_checkpoints[-1]  # tail of the descending-accuracy list
+print(f"Worst model: {worst_checkpoint}")
+print(worst_checkpoint.metrics)
+
+# Worst model: <LoggedModel: artifact_location='file:///Users/serena.ruan/Documents/repos/mlflow-3-doc/mlruns/0/models/0d789084-9a3b-4b85-9d43-6a148c014b7e/artifacts', creation_timestamp=1743734016290, experiment_id='0', last_updated_timestamp=1743734022728, metrics=[<Metric: dataset_digest='1f1c13b5', dataset_name='train', key='accuracy', model_id='0d789084-9a3b-4b85-9d43-6a148c014b7e', run_id='12f143a7fda1461e9240d7ffad4ea5bd', step=0, timestamp=1743734022737, value=0.3>], model_id='0d789084-9a3b-4b85-9d43-6a148c014b7e', model_type='', model_uri='models:/0d789084-9a3b-4b85-9d43-6a148c014b7e', name='torch-iris-0', params={}, source_run_id='12f143a7fda1461e9240d7ffad4ea5bd', status=<LoggedModelStatus.READY: 'READY'>, status_message='', tags={'mlflow.source.git.commit': '7324c807f07a1766d4b951733e3d723504b4576e',
+#  'mlflow.source.name': 'a.py',
+#  'mlflow.source.type': 'LOCAL',
+#  'mlflow.user': 'serena.ruan'}>
+# [<Metric: dataset_digest='1f1c13b5', dataset_name='train', key='accuracy', model_id='0d789084-9a3b-4b85-9d43-6a148c014b7e', run_id='12f143a7fda1461e9240d7ffad4ea5bd', step=0, timestamp=1743734022737, value=0.3>]
+
diff --git a/ceph-s3/s3-demo.py b/ceph-s3/s3-demo.py
@@ -0,0 +1,81 @@
+import boto3
+import pandas as pd
+import yaml
+import mlflow 
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier 
+from sklearn.metrics import accuracy_score 
+
+with open("config.yaml") as f:  # path is relative to the current working directory
+    config = yaml.safe_load(f)
+
+aws_access_key_id = config["aws_access_key_id"]
+aws_secret_access_key = config["aws_secret_access_key"]
+bucket_name = config["bucket_name"]
+object_name = config["object_name"]
+local_file = "wine-quality.csv"  # local path the downloaded object is written to
+endpoint_url = config.get("endpoint_url")  # the only key read with .get(); None when the key is absent
+
+s3_client = boto3.client(
+    "s3",
+    aws_access_key_id=aws_access_key_id,
+    aws_secret_access_key=aws_secret_access_key,
+    region_name="us-east-1",   # optional if needed
+    endpoint_url=endpoint_url
+)
+
+print(f"Downloading {object_name} from bucket {bucket_name}...")
+s3_client.download_file(bucket_name, object_name, local_file)
+
+df = pd.read_csv(local_file)
+print("Dataset shape:", df.shape)
+print(df.head())
+
+X = df.drop("quality", axis=1)  # features: every column except the "quality" label
+y = df["quality"]
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80/20 split, fixed seed
+
+mlflow.set_tracking_uri("https://ard-mlflow.slac.stanford.edu")
+mlflow.set_experiment("S3 Boto3 Demo")
+mlflow.sklearn.autolog()
+
+with mlflow.start_run() as run:
+    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
+    model.fit(X_train, y_train)
+
+    preds = model.predict(X_test)
+    acc = accuracy_score(y_test, preds)
+
+    mlflow.log_metric("accuracy", acc)
+
+    input_example = pd.DataFrame(X_train.iloc[:1])
+
+    mlflow.sklearn.log_model(
+        sk_model=model,
+        name="model",
+        registered_model_name="wine_quality_model",
+        input_example=input_example
+    )
+
+    print("Run ID:", run.info.run_id)
+    print("Accuracy logged:", acc)
+    print("Model registered as: wine_quality_model")
+
+model_uri = "models:/wine_quality_model/7"
+loaded_model = mlflow.pyfunc.load_model(model_uri)
+#model_uri = "mlartifacts/0/models/m-ce23c6a1b14b41a091e6c0b549122d0c/artifacts"
+#model = mlflow.pyfunc.load_model(model_uri)
+run_id = loaded_model.metadata.run_id
+print(run_id)
+
+sample = X_test.iloc[:5]
+preds_loaded = loaded_model.predict(sample)
+print("Sample predictions from registry:", preds_loaded)
+
+df_compare = pd.DataFrame({
+    "Actual": y_test.iloc[:5].values,
+    "Predicted": preds_loaded}
+        )
+print("\nComparison of actual vs predictions:")
+print(df_compare)
diff --git a/ceph-s3/upload.py b/ceph-s3/upload.py
@@ -0,0 +1,22 @@
+import boto3
+
+aws_access_key_id = ""  # placeholder: fill in before running
+aws_secret_access_key = ""  # placeholder: fill in before running
+bucket_name = "mlflow-backup"
+file_path = "wine-quality.csv"       # Local file to upload
+object_name = "wine-quality.csv"    # How it will be named in S3
+
+# If you have a custom S3 endpoint, pass endpoint_url="https://s3.yourprovider.com"
+s3_client = boto3.client(
+    "s3",
+    aws_access_key_id=aws_access_key_id,
+    aws_secret_access_key=aws_secret_access_key,
+    region_name="",   # placeholder: set the bucket's region (s3-demo.py uses "us-east-1")
+    endpoint_url=""  # placeholder: set to the S3 endpoint URL, as in the comment above the call
+)
+
+# Upload the file
+s3_client.upload_file(file_path, bucket_name, object_name)
+
+print(f"Uploaded {file_path} to s3://{bucket_name}/{object_name}")
+
diff --git a/s3.ini b/s3.ini
@@ -0,0 +1,11 @@
+host_base = 
+host_bucket = 
+bucket_location = us-east-1
+use_https = True
+
+# Setup access keys
+access_key = 
+secret_key =
+
+# Enable S3 v4 signature APIs
+signature_v2 = false