Skip to content

bgc_data_processing.core.filtering

Extract data from storers with given conditions.

Constraints()

Slicer object to slice dataframes.

Initiate slicer object to slice dataframes.

Source code in src/bgc_data_processing/core/filtering.py
20
21
22
23
24
25
def __init__(self) -> None:
    """Create a constraints holder with no constraint registered.

    Four empty registries are created: boundary constraints, superset
    constraints, callable constraints and polygon constraints.
    """
    # Polygon constraints are kept in insertion order, one dict per polygon.
    self.polygons: list[dict[str, str | Polygon]] = []
    # Callable registry, keyed by string.
    self.constraints: dict[str, Callable] = {}
    # Allowed values, keyed by field label.
    self.supersets: dict[str, list] = {}
    # 'min'/'max' bounds, keyed by field label.
    self.boundaries: dict[str, dict[str, int | float | datetime]] = {}

boundaries: dict[str, dict[str, int | float | datetime]] = {} instance-attribute

supersets: dict[str, list] = {} instance-attribute

constraints: dict[str, Callable] = {} instance-attribute

polygons: list[dict[str, str | Polygon]] = [] instance-attribute

reset()

Reset all defined constraints.

Source code in src/bgc_data_processing/core/filtering.py
27
28
29
30
31
def reset(self) -> None:
    """Reset all defined constraints.

    Every registry (boundaries, supersets, callables and polygons) is
    replaced by a fresh empty container.
    """
    self.boundaries = {}
    self.supersets = {}
    self.constraints = {}
    # Polygons are constraints too: without this they would survive a reset
    # and keep filtering rows.
    self.polygons = []

add_boundary_constraint(field_label, minimal_value=np.nan, maximal_value=np.nan)

Add a constraint of type 'boundary'.

Parameters:

Name Type Description Default
field_label str

Name of the column to apply the constraint to.

required
minimal_value int | float | datetime

Minimum value for the column, by default np.nan

nan
maximal_value int | float | datetime

Maximum value for the column, by default np.nan

nan
Source code in src/bgc_data_processing/core/filtering.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def add_boundary_constraint(
    self,
    field_label: str,
    minimal_value: int | float | datetime = np.nan,
    maximal_value: int | float | datetime = np.nan,
) -> None:
    """Add a constraint of type 'boundary'.

    Parameters
    ----------
    field_label : str
        Name of the column to apply the constraint to.
    minimal_value : int | float | datetime, optional
        Minimum value for the column., by default np.nan
    maximal_value : int | float | datetime, optional
        Maximum value for the column., by default np.nan
    """
    is_min_nan = isinstance(minimal_value, float) and np.isnan(minimal_value)
    is_max_nan = isinstance(maximal_value, float) and np.isnan(maximal_value)
    if not (is_min_nan and is_max_nan):
        self.boundaries[field_label] = {
            "min": minimal_value,
            "max": maximal_value,
        }

add_superset_constraint(field_label, values_superset=None)

Add a constraint of type 'superset'.

Parameters:

Name Type Description Default
field_label str

Name of the column to apply the constraint to.

required
values_superset list[Any] | None

All the values that the column can take. If None or empty, no constraint will be applied, by default None

None
Source code in src/bgc_data_processing/core/filtering.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def add_superset_constraint(
    self,
    field_label: str,
    values_superset: list[Any] | None = None,
) -> None:
    """Add a constrainte of type 'superset'.

    Parameters
    ----------
    field_label : str
        Name of the column to apply the constraint to.
    values_superset : list[Any] | None
        All the values that the column can take.
        If empty, no constraint will be applied., by default None
    """
    if values_superset is None:
        values_superset = []
    if values_superset:
        self.supersets[field_label] = values_superset

add_polygon_constraint(latitude_field, longitude_field, polygon)

Add a polygon constraint.

Parameters:

Name Type Description Default
latitude_field str

Name of the latitude-related field.

required
longitude_field str

Name of the longitude-related field.

required
polygon Polygon

Polygon to use as boundary.

required
Source code in src/bgc_data_processing/core/filtering.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def add_polygon_constraint(
    self,
    latitude_field: str,
    longitude_field: str,
    polygon: Polygon,
) -> None:
    """Register a polygon constraint.

    The constraint is appended to the list of polygon constraints, so
    several polygons can be registered.

    Parameters
    ----------
    latitude_field : str
        Name of the latitude-related field.
    longitude_field : str
        Name of the longitude-related field.
    polygon : Polygon
        Polygon to use as boundary.
    """
    self.polygons.append(
        dict(
            latitude_field=latitude_field,
            longitude_field=longitude_field,
            polygon=polygon,
        ),
    )

apply_constraints_to_storer(storer)

Apply all constraints to a DataFrame.

The index of the previous Storer's dataframe is conserved.

Parameters:

Name Type Description Default
storer Storer

Storer to apply the constraints to.

required

Returns:

Type Description
Storer

New storer with equivalent parameters and updated data.

Source code in src/bgc_data_processing/core/filtering.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def apply_constraints_to_storer(self, storer: Storer) -> Storer:
    """Build a new Storer holding only the rows that pass the constraints.

    The filtering itself is delegated to apply_constraints_to_dataframe,
    called on the storer's data.

    Parameters
    ----------
    storer : Storer
        Storer to apply the constraints to.

    Returns
    -------
    Storer
        New storer with the same category, providers and variables as the
        input storer, and the filtered data.
    """
    filtered_data = self.apply_constraints_to_dataframe(storer.data)
    return Storer(
        data=filtered_data,
        category=storer.category,
        providers=storer.providers,
        variables=storer.variables,
    )

apply_constraints_to_dataframe(dataframe)

Apply all constraints to a DataFrame.

This slice conserves indexes values.

Parameters:

Name Type Description Default
dataframe DataFrame

DataFrame to apply the constraints to.

required

Returns:

Type Description
DataFrame

DataFrame containing only the rows that verify all constraints.

Source code in src/bgc_data_processing/core/filtering.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def apply_constraints_to_dataframe(
    self,
    dataframe: pd.DataFrame,
) -> pd.DataFrame | None:
    """Apply all constraints to a DataFrame.

    This slice conserves indexes values.

    Parameters
    ----------
    dataframe : pd.DataFrame
        DataFrame to apply the constraints to.

    Returns
    -------
    pd.DataFrame
        DataFrame whose rows verify all constraints or None if inplace=True.
    """
    bool_boundaries = self._apply_boundary_constraints(dataframe)
    bool_supersets = self._apply_superset_constraints(dataframe)
    bool_polygons = self._apply_polygon_constraints(dataframe)
    verify_all = bool_boundaries & bool_supersets & bool_polygons
    return dataframe.loc[verify_all, :]

apply_specific_constraint(field_label, df)

Only apply a single constraint.

Parameters:

Name Type Description Default
field_label str

Label of the field to apply the constraint to.

required
df DataFrame

DataFrame to apply the constraints to.

required

Returns:

Type Description
DataFrame | None

DataFrame containing only the rows that verify the constraints defined on 'field_label'.

Source code in src/bgc_data_processing/core/filtering.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
def apply_specific_constraint(
    self,
    field_label: str,
    df: pd.DataFrame,
) -> pd.DataFrame | None:
    """Apply only the constraints registered for a single field.

    A temporary Constraints object is filled with the boundary and/or
    superset constraint defined for 'field_label' and applied to 'df'.
    Polygon constraints are not copied to the temporary object.

    Parameters
    ----------
    field_label : str
        Label of the field to apply the constraint to.
    df : pd.DataFrame
        DataFrame to apply the constraints to.

    Returns
    -------
    pd.DataFrame | None
        Result of applying the constraints defined on 'field_label' to
        'df'.
    """
    constraint = Constraints()
    if field_label in self.boundaries:
        bounds = self.boundaries[field_label]
        constraint.add_boundary_constraint(
            field_label=field_label,
            minimal_value=bounds["min"],
            maximal_value=bounds["max"],
        )
    if field_label in self.supersets:
        # The keyword must be 'values_superset' to match the signature of
        # add_superset_constraint; 'value_superset' raised a TypeError.
        constraint.add_superset_constraint(
            field_label=field_label,
            values_superset=self.supersets[field_label],
        )
    return constraint.apply_constraints_to_dataframe(dataframe=df)

is_constrained(field_name)

Return True if 'field_name' is constrained.

Parameters:

Name Type Description Default
field_name str

Name of the field to test for a constraint.

required

Returns:

Type Description
bool

True if the field has a constraint.

Source code in src/bgc_data_processing/core/filtering.py
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
def is_constrained(self, field_name: str) -> bool:
    """Tell whether a boundary or superset constraint exists for a field.

    Parameters
    ----------
    field_name : str
        Name of the field to look up.

    Returns
    -------
    bool
        True if 'field_name' has a boundary or a superset constraint.
    """
    registries = (self.boundaries, self.supersets)
    return any(field_name in registry for registry in registries)

get_constraint_parameters(field_name)

Return the constraints on 'field_name'.

Parameters:

Name Type Description Default
field_name str

Field to get the constraint of.

required

Returns:

Type Description
dict

Dictionary with keys 'boundary' and/or 'superset' if constraints exist.

Source code in src/bgc_data_processing/core/filtering.py
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
def get_constraint_parameters(self, field_name: str) -> dict:
    """Collect the boundary and superset constraints set on a field.

    Parameters
    ----------
    field_name : str
        Field to get the constraints of.

    Returns
    -------
    dict
        Dictionary with a 'boundary' and/or a 'superset' key, one for each
        kind of constraint defined on the field. Empty if there is none.
    """
    registries = {"boundary": self.boundaries, "superset": self.supersets}
    return {
        kind: registry[field_name]
        for kind, registry in registries.items()
        if field_name in registry
    }

get_extremes(field_name, default_min=None, default_max=None)

Return extreme values as they appear in the constraints.

Parameters:

Name Type Description Default
field_name str

Name of the field to get the extreme of.

required
default_min int | float | datetime

Default value for the minimum if no constraint exists, by default None

None
default_max int | float | datetime

Default value for the maximum if no constraint exists, by default None

None

Returns:

Type Description
tuple[int | float | datetime, int | float | datetime]

Minimum value, maximum value

Source code in src/bgc_data_processing/core/filtering.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
def get_extremes(
    self,
    field_name: str,
    default_min: int | float | datetime | None = None,
    default_max: int | float | datetime | None = None,
) -> tuple[int | float | datetime, int | float | datetime]:
    """Return extreme values as they appear in the constraints.

    Parameters
    ----------
    field_name : str
        Name of the field to get the extreme of.
    default_min : int | float | datetime, optional
        Default value for the minimum if not constraint exists., by default None
    default_max : int | float | datetime, optional
        Default value for the maximum if not constraint exists., by default None

    Returns
    -------
    tuple[int | float | datetime, int | float | datetime]
        Minimum value, maximum value
    """
    if not self.is_constrained(field_name=field_name):
        return default_min, default_max
    constraints = self.get_constraint_parameters(field_name=field_name)
    boundary_in = "boundary" in constraints
    superset_in = "superset" in constraints
    if boundary_in and superset_in and constraints["superset"]:
        b_min = constraints["boundary"]["min"]
        b_max = constraints["boundary"]["max"]
        s_min = min(constraints["superset"])
        s_max = max(constraints["superset"])
        all_min = min(b_min, s_min)
        all_max = max(b_max, s_max)
    elif not boundary_in:
        all_min = min(constraints["superset"])
        all_max = max(constraints["superset"])
    elif not superset_in:
        all_min = constraints["boundary"]["min"]
        all_max = constraints["boundary"]["max"]
    return all_min, all_max