diff --git a/CHANGELOG.md b/CHANGELOG.md index f0b68043..1420fdda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,8 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `load_geojson` - `load_ml_model` - `load_url` - - `ml_fit_class_random_forest` - - `ml_fit_regr_random_forest` + - `ml_fit` - `ml_predict` - `save_ml_model` - `unflatten_dimension` diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json index 7fa86d89..50533d04 100644 --- a/proposals/load_ml_model.json +++ b/proposals/load_ml_model.json @@ -1,7 +1,7 @@ { "id": "load_ml_model", "summary": "Load a ML model", - "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``ml_fit_regr_random_forest()`` and ``save_ml_model()``.", + "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``ml_fit()`` and ``save_ml_model()``.", "categories": [ "machine learning", "import" @@ -43,4 +43,4 @@ "rel": "about" } ] -} +} \ No newline at end of file diff --git a/proposals/ml_fit.json b/proposals/ml_fit.json new file mode 100644 index 00000000..c43634f0 --- /dev/null +++ b/proposals/ml_fit.json @@ -0,0 +1,89 @@ +{ + "id": "ml_fit", + "summary": "Train a machine learning or deep learning model", + "description": "Fits a machine learning or deep learning model to training data. The process can be used for both classification and regression tasks; which of the two is performed is determined by the nature of the labels.", + "categories": [ + "machine learning", + "deep learning" + ], + "experimental": true, + "parameters": [ + { + "name": "predictors", + "description": "The predictors for the model as a vector data cube. 
These are the independent variables that the algorithm analyses to learn patterns and relationships within the data.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + } + ] + }, + { + "name": "target", + "description": "The dependent variable for the model as a vector data cube. This is the labeled data, which is aligned with the predictor values through the shared geometry dimension. This ensures a clear connection between predictor rows and labels.", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + } + ] + } + }, + { + "name": "ml_method", + "description": "The machine learning method to be used for fitting the model.", + "schema": { + "type": "string", + "enum": [ + "cnn", + "cnn_lstm", + "mlp", + "random_forest", + "resnet", + "svm", + "tempcnn", + "xgboost" + ], + "default": "random_forest" + } + }, + { + "name": "parameters", + "description": "Additional parameters for the chosen machine learning method, passed as an object.", + "schema": { + "type": "object" + }, + "default": null, + "optional": true + } + ], + "returns": { + "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + } +} \ No newline at end of file diff --git a/proposals/ml_fit_class_random_forest.json b/proposals/ml_fit_class_random_forest.json deleted file mode 100644 index 63da48a1..00000000 --- a/proposals/ml_fit_class_random_forest.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "id": "ml_fit_class_random_forest", - "summary": "Train a random forest classification model", - "description": "Executes the fit of a random forest classification based on training data. 
The process does not include a separate split of the data in test, validation and training data. The Random Forest classification model is based on the approach by Breiman (2001).", - "categories": [ - "machine learning" - ], - "experimental": true, - "parameters": [ - { - "name": "predictors", - "description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.", - "schema": [ - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "bands" - } - ] - }, - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "other" - } - ] - } - ] - }, - { - "name": "target", - "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. fractional forest canopy cover).", - "schema": { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - } - ] - } - }, - { - "name": "max_variables", - "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split.\n- `sqrt`: The square root of the number of variables are considered for each split. 
This is often the default for classification.", - "schema": [ - { - "type": "integer", - "minimum": 1 - }, - { - "type": "string", - "enum": [ - "all", - "log2", - "onethird", - "sqrt" - ] - } - ] - }, - { - "name": "num_trees", - "description": "The number of trees build within the Random Forest classification.", - "optional": true, - "default": 100, - "schema": { - "type": "integer", - "minimum": 1 - } - }, - { - "name": "seed", - "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.", - "optional": true, - "default": null, - "schema": { - "type": [ - "integer", - "null" - ] - } - } - ], - "returns": { - "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", - "schema": { - "type": "object", - "subtype": "ml-model" - } - }, - "links": [ - { - "href": "https://doi.org/10.1023/A:1010933404324", - "title": "Breiman (2001): Random Forests", - "type": "text/html", - "rel": "about" - } - ] -} diff --git a/proposals/ml_fit_regr_random_forest.json b/proposals/ml_fit_regr_random_forest.json deleted file mode 100644 index 39207324..00000000 --- a/proposals/ml_fit_regr_random_forest.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "id": "ml_fit_regr_random_forest", - "summary": "Train a random forest regression model", - "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).", - "categories": [ - "machine learning" - ], - "experimental": true, - "parameters": [ - { - "name": "predictors", - "description": "The predictors for the regression model as a vector data cube. 
Aggregated to the features (vectors) of the target input variable.", - "schema": [ - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "bands" - } - ] - }, - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "other" - } - ] - } - ] - }, - { - "name": "target", - "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. fractional forest canopy cover).", - "schema": { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - } - ] - } - }, - { - "name": "max_variables", - "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables are considered for each split.", - "schema": [ - { - "type": "integer", - "minimum": 1 - }, - { - "type": "string", - "enum": [ - "all", - "log2", - "onethird", - "sqrt" - ] - } - ] - }, - { - "name": "num_trees", - "description": "The number of trees build within the Random Forest regression.", - "optional": true, - "default": 100, - "schema": { - "type": "integer", - "minimum": 1 - } - }, - { - "name": "seed", - "description": "A randomization seed to use for the random sampling in training. 
If not given or `null`, no seed is used and results may differ on subsequent use.", - "optional": true, - "default": null, - "schema": { - "type": [ - "integer", - "null" - ] - } - } - ], - "returns": { - "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", - "schema": { - "type": "object", - "subtype": "ml-model" - } - }, - "links": [ - { - "href": "https://doi.org/10.1023/A:1010933404324", - "title": "Breiman (2001): Random Forests", - "type": "text/html", - "rel": "about" - } - ] -} diff --git a/proposals/ml_predict.json b/proposals/ml_predict.json index 87cd2500..b1cbf545 100644 --- a/proposals/ml_predict.json +++ b/proposals/ml_predict.json @@ -17,7 +17,7 @@ }, { "name": "model", - "description": "A ML model that was trained with one of the ML training processes such as ``ml_fit_regr_random_forest()``.", + "description": "A ML model that was trained with the ML training process ``ml_fit()``.", "schema": { "type": "object", "subtype": "ml-model" @@ -46,4 +46,4 @@ ] } } -} +} \ No newline at end of file