Skip to content

ml module

Module for machine learning with Google Earth Engine.

csv_to_classifier(in_csv)

Convert a CSV file containing a list of strings (an ensemble of decision trees) to an ee.Classifier.

Parameters:

Name Type Description Default
in_csv str

File path to the input CSV.

required
Source code in geemap/ml.py
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
def csv_to_classifier(in_csv):
    """Convert a CSV file containing a list of strings (an ensemble of decision trees) to an ee.Classifier.

    Every non-empty line of the file is treated as one decision tree string. Each
    string is stored in the "tree" property of a feature and the resulting
    feature collection is passed to `fc_to_classifier`.

    Args:
        in_csv (str): File path to the input CSV.

    Returns:
        object: ee.Classifier, or None when the input file cannot be found.
    """

    # normalize the path the same way trees_to_csv does
    in_csv = os.path.abspath(in_csv)

    try:
        with open(in_csv) as f:
            # skip blank lines (e.g. a trailing newline) so that no empty
            # tree strings end up in the ensemble
            tree_lines = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"{in_csv} could not be found.")
        return None

    # every feature needs a geometry; a point at (0, 0) is used as a placeholder
    null_island = ee.Geometry.Point([0, 0])
    features = [ee.Feature(null_island, {"tree": tree}) for tree in tree_lines]
    rf_fc = ee.FeatureCollection(features)

    return fc_to_classifier(rf_fc)

export_trees_to_fc(trees, asset_id, description='geemap_rf_export')

Function that creates a feature collection with a property tree which contains the string representation of decision trees and exports to ee asset for later use

Parameters:

Name Type Description Default
trees list[str]

list of string representation of the decision trees

required
asset_id str

ee asset id path to export the feature collection to

required
kwargs

description (str): optional description to provide export information. default = "geemap_rf_export"

Source code in geemap/ml.py
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
def export_trees_to_fc(trees, asset_id, description="geemap_rf_export"):
    """Export decision tree strings to an ee asset as a feature collection.

    Each tree string is stored in the "tree" property of its own feature, with
    newline characters encoded as "#" so they can be parsed back later.

    args:
        trees (list[str]): list of string representation of the decision trees
        asset_id (str): ee asset id path to export the feature collection to

    kwargs:
        description (str): optional description to provide export information. default = "geemap_rf_export"

    """
    # a placeholder point geometry; it is needed to properly export the collection
    origin = ee.Geometry.Point([0, 0])

    # one feature per tree, located at the placeholder point
    tree_features = []
    for tree_str in trees:
        encoded = tree_str.replace("\n", "#")
        tree_features.append(ee.Feature(origin, {"tree": encoded}))

    # build the export task for the collection and start it
    export_task = ee.batch.Export.table.toAsset(
        collection=ee.FeatureCollection(tree_features),
        description=description,
        assetId=asset_id,
    )
    export_task.start()

fc_to_classifier(fc)

Function that takes a feature collection resulting from export_trees_to_fc and creates an ee.Classifier that can be used with ee objects

Parameters:

Name Type Description Default
fc FeatureCollection

feature collection in which each feature has a tree property holding the string representation of one decision tree

required

Returns:

Name Type Description
classifier Classifier

ee classifier object representing an ensemble decision tree

Source code in geemap/ml.py
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
def fc_to_classifier(fc):
    """Build an ee.Classifier from a feature collection produced by `export_trees_to_fc`.

    args:
        fc (ee.FeatureCollection): feature collection whose features carry a "tree" property holding the string representation of a decision tree

    returns:
        classifier (ee.Classifier): ee classifier object representing an ensemble decision tree

    """

    def _decode(encoded_tree):
        # "#" stands in for a newline in the stored strings; restore every occurrence
        return ee.String(encoded_tree).replace("#", "\n", "g")

    # collect the "tree" property of every feature and decode each string
    decoded_trees = fc.aggregate_array("tree").map(_decode)

    # the decoded strings define an ensemble decision tree classifier (i.e. RandomForest)
    return ee.Classifier.decisionTreeEnsemble(decoded_trees)

rf_to_strings(estimator, feature_names, processes=2, output_mode='INFER')

Function to convert an ensemble of decision trees into a list of strings. Wraps tree_to_string

Parameters:

Name Type Description Default
estimator estimator

An ensemble of decision tree classifiers or regressors created using sklearn (e.g. a random forest). Expects the object to contain the estimators_ attribute

required
feature_names list[str]

List of strings that define the name of features (i.e. bands) used to create the model

required
processes int

number of cpu processes to spawn. Increasing processes will improve speed for large models. default = 2

2
output_mode str

the output mode of the estimator. Options are "INFER", "CLASSIFICATION", "REGRESSION", or "PROBABILITY" (capitalization does not matter). default = "INFER"

'INFER'

Returns:

Name Type Description
trees list[str]

list of strings where each string represents a decision tree estimator and collectively represent an ensemble decision tree estimator (i.e. RandomForest)

Source code in geemap/ml.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
def rf_to_strings(estimator, feature_names, processes=2, output_mode="INFER"):
    """Function to convert an ensemble of decision trees into a list of strings. Wraps `tree_to_string`

    args:
        estimator (sklearn.ensemble.estimator): An ensemble of decision trees created using sklearn. Expects object to contain the estimators_ attribute
        feature_names (list[str]): List of strings that define the name of features (i.e. bands) used to create the model
        processes (int): number of cpu processes to spawn. Increasing processes will improve speed for large models. The value is clamped to the range [1, cpu count - 1]. default = 2
        output_mode (str): the output mode of the estimator. Options are "INFER", "CLASSIFICATION", "REGRESSION", or "PROBABILITY" (capitalization does not matter). default = "INFER"

    returns:
        trees (list[str]): list of strings where each string represents a decision tree estimator and collectively represent an ensemble decision tree estimator (i.e. RandomForest)

    raises:
        ValueError: when output_mode is not one of the available modes
        RuntimeError: when output_mode is "INFER" and the estimator criterion is not recognized

    """

    # force output mode to be capital
    output_mode = output_mode.upper()

    available_modes = ["INFER", "CLASSIFICATION", "REGRESSION", "PROBABILITY"]

    if output_mode not in available_modes:
        raise ValueError(
            f"The provided output_mode is not available, please provide one from the following list: {available_modes}"
        )

    # extract out the estimator trees
    # squeeze drops singleton axes; atleast_1d keeps a one-tree ensemble
    # iterable (squeeze alone would turn it into a 0-d array)
    estimators = np.atleast_1d(np.squeeze(estimator.estimators_))

    if output_mode == "INFER":
        # "log_loss", "squared_error", "absolute_error" and "poisson" are the
        # criterion names used by newer sklearn releases; "mse"/"mae" are the
        # legacy spellings.
        # NOTE(review): "friedman_mse" is deliberately not listed because
        # gradient boosting classifiers also use it - confirm before adding.
        if estimator.criterion in ["gini", "entropy", "log_loss"]:
            class_labels = estimator.classes_
        elif estimator.criterion in [
            "mse",
            "mae",
            "squared_error",
            "absolute_error",
            "poisson",
        ]:
            class_labels = None
        else:
            raise RuntimeError(
                "Could not infer the output type from the estimator, please explicitly provide the output_mode "
            )

    elif output_mode == "CLASSIFICATION":
        class_labels = estimator.classes_

    else:
        class_labels = None

    # never use more than cpu count - 1 processes, but always at least one:
    # on a single-cpu machine cpu count - 1 is 0, which mp.Pool rejects
    processes = max(1, min(processes, mp.cpu_count() - 1))

    # run the tree extraction process in parallel
    with mp.Pool(processes) as pool:
        proc = pool.map_async(
            partial(
                tree_to_string,
                feature_names=feature_names,
                labels=class_labels,
                output_mode=output_mode,
            ),
            estimators,
        )
        trees = list(proc.get())

    return trees

strings_to_classifier(trees)

Function that takes string representations of decision trees and creates an ee.Classifier that can be used with ee objects

Parameters:

Name Type Description Default
trees list[str]

list of string representation of the decision trees

required

Returns:

Name Type Description
classifier Classifier

ee classifier object representing an ensemble decision tree

Source code in geemap/ml.py
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
def strings_to_classifier(trees):
    """Create an ee.Classifier from the string representation of decision trees.

    args:
        trees (list[str]): list of string representation of the decision trees

    returns:
        classifier (ee.Classifier): ee classifier object representing an ensemble decision tree

    """

    # wrap every python string in an ee.String
    wrapped_trees = []
    for tree_str in trees:
        wrapped_trees.append(ee.String(tree_str))

    # the list of ee.Strings defines an ensemble decision tree classifier (i.e. RandomForest)
    return ee.Classifier.decisionTreeEnsemble(wrapped_trees)

tree_to_string(estimator, feature_names, labels=None, output_mode='INFER')

Function to convert a sklearn decision tree object to a string format that EE can interpret

Parameters:

Name Type Description Default
estimator estimator

A single decision tree classifier or regressor created using sklearn. Expects object to contain the tree_ attribute

required
feature_names Iterable[str]

List of strings that define the name of features (i.e. bands) used to create the model

required
labels Iterable[numeric]

List of class labels to map outputs to, must be numeric values. If None, then raw outputs will be used. default = None

None
output_mode str

the output mode of the estimator. Options are "INFER", "CLASSIFICATION", "REGRESSION", or "PROBABILITY" (capitalization does not matter). default = "INFER"

'INFER'

Returns:

Name Type Description
tree_str str

string representation of decision tree estimator

Raises:

Type Description
RuntimeError

raises run time error when function cannot determine if the estimator is for regression or classification problem

Source code in geemap/ml.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
def tree_to_string(estimator, feature_names, labels=None, output_mode="INFER"):
    """Function to convert a sklearn decision tree object to a string format that EE can interpret

    args:
        estimator (sklearn.tree.estimator): A single decision tree classifier or regressor. Expects object to contain the tree_ attribute
        feature_names (Iterable[str]): List of strings that define the name of features (i.e. bands) used to create the model
        labels (Iterable[numeric]): List of class labels to map outputs to, must be numeric values. The label at position i is used for class index i of the tree values. If None, then raw outputs will be used. default = None
        output_mode (str): the output mode of the estimator. Options are "INFER", "CLASSIFICATION", "REGRESSION", or "PROBABILITY" (capitalization does not matter). default = "INFER"

    returns:
        tree_str (str): string representation of decision tree estimator

    raises:
        RuntimeError: raises run time error when function cannot determine if the estimator is for regression or classification problem
        ValueError: when output_mode is "PROBABILITY" and the tree values do not have exactly two classes
        NotImplementedError: when output_mode is "MULTIPROBABILITY"
    """

    # the documented contract is case-insensitive, so normalize before comparing
    output_mode = output_mode.upper()

    # extract out the information need to build the tree string
    n_nodes = estimator.tree_.node_count
    children_left = estimator.tree_.children_left
    children_right = estimator.tree_.children_right
    feature_idx = estimator.tree_.feature
    impurities = estimator.tree_.impurity
    n_samples = estimator.tree_.n_node_samples
    thresholds = estimator.tree_.threshold
    features = [feature_names[i] for i in feature_idx]

    raw_vals = np.squeeze(estimator.tree_.value)

    # first check if user wants to infer output mode
    # if so, reset the output_mode variable to a valid mode
    if output_mode == "INFER":
        if raw_vals.ndim == 2:
            output_mode = "CLASSIFICATION"

        elif raw_vals.ndim == 1:
            output_mode = "REGRESSION"

        else:
            raise RuntimeError(
                "Could not infer the output type from the estimator, please explicitly provide the output_mode "
            )

    # second check on the output mode after the inference
    if output_mode == "CLASSIFICATION":
        # take argmax along class axis from values
        values = raw_vals.argmax(axis=-1)
        if labels is not None:
            # the argmax result is a class index, so it indexes labels directly;
            # enumerating only the indices present in this tree would shift the
            # labels whenever a class is never the majority in any node
            values = [labels[v] for v in values]

        out_type = int

    elif output_mode == "REGRESSION":
        # take values and drop un needed axis
        values = np.around(raw_vals, decimals=6)
        out_type = float

    elif output_mode == "PROBABILITY":
        # calculate fraction of samples of the same class in a leaf
        # currently only supporting binary classifications
        # check if n classes == 2 (i.e. binary classes)
        if raw_vals.shape[-1] != 2:
            raise ValueError(
                f"shape mismatch: outputs from trees = {raw_vals.shape[-1]} classes, currently probability outputs is support for binary classifications"
            )

        probas = np.around(
            (raw_vals / np.sum(raw_vals, axis=1)[:, np.newaxis]), decimals=6
        )

        # keep the fraction of the last (second) class
        values = probas[:, -1]
        out_type = float

    elif output_mode == "MULTIPROBABILITY":
        # per-class fractions would be a 2-d array making the output multidimensional
        raise NotImplementedError(
            "Currently multiprobability output is not support, please choose one of the following output modes: ['CLASSIFICATION', 'REGRESSION', 'PROBABILITY' or 'INFER']"
        )

    else:
        raise RuntimeError(
            "could not understand estimator type and parse out the values"
        )

    # use iterative pre-order search to extract node depth and leaf information
    node_ids = np.zeros(shape=n_nodes, dtype=np.int64)
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1
        node_ids[node_id] = node_id

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    # create a table of the initial structure
    # each row is a node or leaf
    df = pd.DataFrame(
        {
            "node_id": node_ids,
            "node_depth": node_depth,
            "is_leaf": is_leaves,
            "children_left": children_left,
            "children_right": children_right,
            "value": values,
            "criterion": impurities,
            "n_samples": n_samples,
            "threshold": thresholds,
            "feature_name": features,
            "sign": ["<="] * n_nodes,
        },
        dtype="object",
    )

    # the table representation does not have left vs right node structure
    # so we need to add in right nodes in the correct location
    # we do this by first calculating which nodes are right and then insert them at the correct index

    # get a dict of right node rows and assign key based on index where to insert
    inserts = {}
    for row in df.itertuples():
        child_r = row.children_right
        if child_r > row.Index:
            # copy of the split row that represents the ">" branch
            ordered_row = np.array(row)
            ordered_row[-1] = ">"
            inserts[child_r] = ordered_row[1:]  # drop index value
    # sort the inserts as to keep track of the additive indexing
    inserts_sorted = {k: inserts[k] for k in sorted(inserts.keys())}

    # loop through the row inserts and add to table (array)
    # every earlier insert shifts the later positions by one, hence (k + i)
    table_values = df.values
    for i, k in enumerate(inserts_sorted.keys()):
        table_values = np.insert(table_values, (k + i), inserts_sorted[k], axis=0)

    # make the ordered table array into a dataframe
    # note: df is dtype "object", need to cast later on
    ordered_df = pd.DataFrame(table_values, columns=df.columns)

    max_depth = np.max(ordered_df.node_depth.astype(int))
    tree_str = f"1) root {n_samples[0]} 9999 9999 ({impurities.sum()})\n"
    previous_depth = -1
    cnts = []
    # loop through the nodes and calculate the node number and values per node
    for row in ordered_df.itertuples():
        node_depth = int(row.node_depth)
        left = int(row.children_left)
        right = int(row.children_right)
        # only split rows (left != right) produce a line; leaf rows are
        # reported through the split row that leads to them
        if left != right:
            if row.Index == 0:
                cnt = 2
            elif previous_depth > node_depth:
                # moving back up the tree: continue from the last number
                # used at this depth
                depths = ordered_df.node_depth.values[: row.Index]
                idx = np.where(depths == node_depth)[0][-1]
                cnt = cnts[idx] + 1
            elif previous_depth < node_depth:
                cnt = cnts[row.Index - 1] * 2
            elif previous_depth == node_depth:
                cnt = cnts[row.Index - 1] + 1

            if node_depth == (max_depth - 1):
                # at the deepest split level the next row is the leaf reached
                # by this branch, so report that leaf's values
                value = out_type(ordered_df.iloc[row.Index + 1].value)
                samps = int(ordered_df.iloc[row.Index + 1].n_samples)
                criterion = float(ordered_df.iloc[row.Index + 1].criterion)
                tail = " *\n"
            else:
                if (
                    (bool(ordered_df.loc[ordered_df.node_id == left].iloc[0].is_leaf))
                    and (
                        bool(
                            int(row.Index)
                            < int(ordered_df.loc[ordered_df.node_id == left].index[0])
                        )
                    )
                    and (str(row.sign) == "<=")
                ):
                    # "<=" branch that ends directly in a leaf
                    rowx = ordered_df.loc[ordered_df.node_id == left].iloc[0]
                    tail = " *\n"
                    value = out_type(rowx.value)
                    samps = int(rowx.n_samples)
                    criterion = float(rowx.criterion)

                elif (
                    (bool(ordered_df.loc[ordered_df.node_id == right].iloc[0].is_leaf))
                    and (
                        bool(
                            int(row.Index)
                            < int(ordered_df.loc[ordered_df.node_id == right].index[0])
                        )
                    )
                    and (str(row.sign) == ">")
                ):
                    # ">" branch that ends directly in a leaf
                    rowx = ordered_df.loc[ordered_df.node_id == right].iloc[0]
                    tail = " *\n"
                    value = out_type(rowx.value)
                    samps = int(rowx.n_samples)
                    criterion = float(rowx.criterion)

                else:
                    # branch leads to another split: report the split's own values
                    value = out_type(row.value)
                    samps = int(row.n_samples)
                    criterion = float(row.criterion)
                    tail = "\n"

            # extract out the information needed in each line
            spacing = (node_depth + 1) * "  "  # for pretty printing
            fname = str(row.feature_name)  # name of the feature (i.e. band name)
            tresh = float(row.threshold)  # threshold
            sign = str(row.sign)

            tree_str += f"{spacing}{cnt}) {fname} {sign} {tresh:.6f} {samps} {criterion:.4f} {value}{tail}"
            previous_depth = node_depth
        # keep one entry per row so cnts can be indexed by row position
        cnts.append(cnt)

    return tree_str

trees_to_csv(trees, out_csv)

Save a list of strings (an ensemble of decision trees) to a CSV file.

Parameters:

Name Type Description Default
trees list

A list of strings (an ensemble of decision trees).

required
out_csv str

File path to the output csv

required
Source code in geemap/ml.py
387
388
389
390
391
392
393
394
395
396
def trees_to_csv(trees, out_csv):
    """Write an ensemble of decision tree strings to a file, one tree per line.

    Newline characters inside each tree string are replaced by "#" so that
    every tree occupies exactly one line of the output file.

    Args:
        trees (list): A list of strings (an ensemble of decision trees).
        out_csv (str): File path to the output csv
    """
    target_path = os.path.abspath(out_csv)
    with open(target_path, "w") as handle:
        encoded = "".join(
            tree_str.replace("\n", "#") + "\n" for tree_str in trees
        )
        handle.write(encoded)