whizzml · mmerce · May 23, 2025 · jaor · May 23, 2025 · jaor
diff --git a/cross-validation/metadata.json b/cross-validation/metadata.json
@@ -2,5 +2,5 @@
   "name": "Cross validation",
   "description": "A collection of whizzml scripts and libraries performing k-fold cross-validation",
   "kind": "package",
-  "components": ["cross-validation-gen", "basic", "model", "ensemble", "logistic-regression", "boosted-ensemble", "deepnet", "supervised-conf", "linear-regression"]
+  "components": ["cross-validation-gen", "basic", "model", "ensemble", "logistic-regression", "boosted-ensemble", "deepnet", "supervised-conf", "linear-regression", "purged-cross-validation"]
 }
diff --git a/cross-validation/purged-cross-validation/metadata.json b/cross-validation/purged-cross-validation/metadata.json
@@ -0,0 +1,6 @@
+{
+  "name": "Purged k-fold cross-validation script",
+  "description": "The objective of this script is to perform a purged k-fold cross validation of any supervised model built from a time-series-like dataset that has already been sorted. The algorithm:\n\n - Divides the dataset into k parts\n - Holds out the data in one of the parts and builds a supervised model\n with the rest of the data\n - Removes the edges of the hold out dataset to create the test dataset (avoiding leakage).\n - Evaluates the supervised model with the purged test dataset\n - The second, third and fourth steps are repeated with each of the k parts, so that\n k evaluations are generated\n - Finally, the evaluation metrics are averaged to provide the cross-validation\n metrics.\n\n The **goal** of the script is to produce a\n cross-validation, an evaluation whose metrics are averages of the k evaluations\n created in the cross-validation process.\n\n For more information, please see the [readme](https://github.com/whizzml/examples/tree/master/cross-validation/purged-cross-validation).",
+  "kind": "script",
+  "source_code": "script.whizzml"
+}
diff --git a/cross-validation/purged-cross-validation/readme.md b/cross-validation/purged-cross-validation/readme.md
@@ -0,0 +1,18 @@
+# Script for purged k-fold cross-validation
+
+The objective of this script is to create a purged k-fold cross validation
+starting from any classification model
+built from a time-series kind of dataset that has been previously ordered.
+
+The algorithm:
+
+- Divides the dataset in k parts
+- Holds out the data in one of the parts and builds the same supervised model
+  used as input with the rest of data
+- Creates a test dataset by purging the edges of the hold-out data (15% of
+  that data) to avoid leakage.
+- Evaluates the model with the test data
+- The second, third and fourth steps are repeated with each of the k parts,
+  so that k evaluations are generated
+- Finally, the evaluation metrics are averaged to provide the cross-validation
+  metrics.