Skip to content

bgc_data_processing.core.io.readers

Read generated files.

Reader(filepath, providers_column_label='PROVIDER', expocode_column_label='EXPOCODE', date_column_label='DATE', year_column_label='YEAR', month_column_label='MONTH', day_column_label='DAY', hour_column_label='HOUR', latitude_column_label='LATITUDE', longitude_column_label='LONGITUDE', depth_column_label='DEPH', variables_reference=None, category='in_situ', unit_row_index=1, delim_whitespace=True)

Reading routine to parse csv files.

Parameters:

Name Type Description Default
filepath Path | str

Path to the file to read.

required
providers_column_label str

Provider column in the dataframe., by default "PROVIDER"

'PROVIDER'
expocode_column_label str

Expocode column in the dataframe., by default "EXPOCODE"

'EXPOCODE'
date_column_label str

Date column in the dataframe., by default "DATE"

'DATE'
year_column_label str

Year column in the dataframe., by default "YEAR"

'YEAR'
month_column_label str

Month column in the dataframe., by default "MONTH"

'MONTH'
day_column_label str

Day column in the dataframe., by default "DAY"

'DAY'
hour_column_label str

Hour column in the dataframe., by default "HOUR"

'HOUR'
latitude_column_label str

Latitude column in the dataframe., by default "LATITUDE"

'LATITUDE'
longitude_column_label str

Longitude column in the dataframe., by default "LONGITUDE"

'LONGITUDE'
depth_column_label str

Depth column in the dataframe., by default "DEPH"

'DEPH'
variables_reference list[BaseVar] | None

List of variables to use as reference. If a variable label matches a column name, that variable is used for the output storer. By default None.

None
category str

Category of the loaded file., by default "in_situ"

'in_situ'
unit_row_index int

Index of the row with the units, or None if there is no unit row. By default 1.

1
delim_whitespace bool

Whether to use whitespace as delimiters., by default True

True

Examples:

Loading from a file:

>>> filepath = "path/to/file"
>>> reader = Reader(filepath, providers_column_label="providers_column_name")

Getting the storer:

>>> storer = reader.get_storer()
Source code in src/bgc_data_processing/core/io/readers.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def __init__(
    self,
    filepath: Path | str,
    providers_column_label: str = "PROVIDER",
    expocode_column_label: str = "EXPOCODE",
    date_column_label: str = "DATE",
    year_column_label: str = "YEAR",
    month_column_label: str = "MONTH",
    day_column_label: str = "DAY",
    hour_column_label: str = "HOUR",
    latitude_column_label: str = "LATITUDE",
    longitude_column_label: str = "LONGITUDE",
    depth_column_label: str = "DEPH",
    variables_reference: list[BaseVar] | None = None,
    category: str = "in_situ",
    unit_row_index: int = 1,
    delim_whitespace: bool = True,
):
    """Read the file and store its data, providers and variables.

    Parameters
    ----------
    filepath : Path | str
        Path to the file to read.
    providers_column_label : str, optional
        Provider column in the dataframe, by default "PROVIDER"
    expocode_column_label : str, optional
        Expocode column in the dataframe, by default "EXPOCODE"
    date_column_label : str, optional
        Date column in the dataframe, by default "DATE"
    year_column_label : str, optional
        Year column in the dataframe, by default "YEAR"
    month_column_label : str, optional
        Month column in the dataframe, by default "MONTH"
    day_column_label : str, optional
        Day column in the dataframe, by default "DAY"
    hour_column_label : str, optional
        Hour column in the dataframe, by default "HOUR"
    latitude_column_label : str, optional
        Latitude column in the dataframe, by default "LATITUDE"
    longitude_column_label : str, optional
        Longitude column in the dataframe, by default "LONGITUDE"
    depth_column_label : str, optional
        Depth column in the dataframe, by default "DEPH"
    variables_reference : list[BaseVar] | None, optional
        Variables to use as reference, indexed internally by their label,
        by default None
    category : str, optional
        Category of the loaded file, by default "in_situ"
    unit_row_index : int, optional
        Index of the row with the units, forwarded to the file reading step,
        by default 1
    delim_whitespace : bool, optional
        Whether to use whitespace as delimiters, by default True
    """
    # The attribute must exist in both branches: assigning the empty mapping
    # to the local parameter name would leave `self._reference_vars` undefined
    # whenever no reference variable is supplied.
    if variables_reference is None:
        self._reference_vars: dict[str, BaseVar] = {}
    else:
        self._reference_vars = {var.label: var for var in variables_reference}

    raw_df, unit_row = self._read(
        filepath=Path(filepath),
        unit_row_index=unit_row_index,
        delim_whitespace=delim_whitespace,
    )
    # Column label in the file -> fixed name of the corresponding mandatory
    # variable; handed to `_get_variables` below.
    mandatory_vars = {
        providers_column_label: "provider",
        expocode_column_label: "expocode",
        date_column_label: "date",
        year_column_label: "year",
        month_column_label: "month",
        day_column_label: "day",
        hour_column_label: "hour",
        latitude_column_label: "latitude",
        longitude_column_label: "longitude",
        depth_column_label: "depth",
    }
    self._category = category
    if providers_column_label is not None:
        # Distinct values found in the provider column.
        self._providers = raw_df[providers_column_label].unique().tolist()
    else:
        # No provider column to read from: fall back to a placeholder name.
        self._providers = ["????"]
    self._data = self._add_date_columns(
        raw_df,
        year_column_label,
        month_column_label,
        day_column_label,
        date_column_label,
    )
    self._variables = self._get_variables(raw_df, unit_row, mandatory_vars)

get_storer()

Return the Storer storing the data loaded.

Returns:

Type Description
Storer

Contains the data from the csv.

Source code in src/bgc_data_processing/core/io/readers.py
392
393
394
395
396
397
398
399
400
401
402
403
404
405
def get_storer(self) -> "Storer":
    """Build a Storer from what this reader loaded.

    Returns
    -------
    Storer
        Storer holding the data, category, providers and storing variables
        collected by the reader.
    """
    # Gather the constructor arguments first, then hand them over in one go.
    storer_arguments = {
        "data": self._data,
        "category": self._category,
        "providers": self._providers,
        "variables": self._variables.storing_variables,
    }
    return Storer(**storer_arguments)

read_files(filepath, providers_column_label='PROVIDER', expocode_column_label='EXPOCODE', date_column_label='DATE', year_column_label='YEAR', month_column_label='MONTH', day_column_label='DAY', hour_column_label='HOUR', latitude_column_label='LATITUDE', longitude_column_label='LONGITUDE', depth_column_label='DEPH', variables_reference=None, category='in_situ', unit_row_index=1, delim_whitespace=True)

Build Storer reading data from csv or txt files.

Parameters:

Name Type Description Default
filepath Path | str | list[Path] | list[str]

Path to the file to read.

required
providers_column_label str

Provider column in the dataframe., by default "PROVIDER"

'PROVIDER'
expocode_column_label str

Expocode column in the dataframe., by default "EXPOCODE"

'EXPOCODE'
date_column_label str

Date column in the dataframe., by default "DATE"

'DATE'
year_column_label str

Year column in the dataframe., by default "YEAR"

'YEAR'
month_column_label str

Month column in the dataframe., by default "MONTH"

'MONTH'
day_column_label str

Day column in the dataframe., by default "DAY"

'DAY'
hour_column_label str

Hour column in the dataframe., by default "HOUR"

'HOUR'
latitude_column_label str

Latitude column in the dataframe., by default "LATITUDE"

'LATITUDE'
longitude_column_label str

Longitude column in the dataframe., by default "LONGITUDE"

'LONGITUDE'
depth_column_label str

Depth column in the dataframe., by default "DEPH"

'DEPH'
variables_reference list[BaseVar] | None

List of variables to use as reference. If a variable label matches a column name, that variable is used for the output storer. By default None.

None
category str

Category of the loaded file., by default "in_situ"

'in_situ'
unit_row_index int

Index of the row with the units, None if there's no unit row., by default 1

1
delim_whitespace bool

Whether to use whitespace as delimiters., by default True

True

Returns:

Type Description
Storer

Storer aggregating the data from all the files

Raises:

Type Description
TypeError

If filepath argument is not an instance of Path, string or list.

Examples:

Loading from a single file:

>>> filepath = "path/to/file"
>>> storer = read_files(filepath, providers_column_label="providers_column_name")

Loading from multiple files:

>>> filepaths = [
...     "path/to/file1",
...     "path/to/file2",
... ]
>>> storer = read_files(
...     filepaths,
... )
Source code in src/bgc_data_processing/core/io/readers.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def read_files(
    filepath: Path | str | list[Path] | list[str],
    providers_column_label: str = "PROVIDER",
    expocode_column_label: str = "EXPOCODE",
    date_column_label: str = "DATE",
    year_column_label: str = "YEAR",
    month_column_label: str = "MONTH",
    day_column_label: str = "DAY",
    hour_column_label: str = "HOUR",
    latitude_column_label: str = "LATITUDE",
    longitude_column_label: str = "LONGITUDE",
    depth_column_label: str = "DEPH",
    variables_reference: list[BaseVar] | None = None,
    category: str = "in_situ",
    unit_row_index: int = 1,
    delim_whitespace: bool = True,
) -> "Storer":
    """Build Storer reading data from csv or txt files.

    Parameters
    ----------
    filepath : Path | str | list[Path] | list[str]
        Path to the file to read, or list of paths to read one by one.
    providers_column_label : str, optional
        Provider column in the dataframe, by default "PROVIDER"
    expocode_column_label : str, optional
        Expocode column in the dataframe, by default "EXPOCODE"
    date_column_label : str, optional
        Date column in the dataframe, by default "DATE"
    year_column_label : str, optional
        Year column in the dataframe, by default "YEAR"
    month_column_label : str, optional
        Month column in the dataframe, by default "MONTH"
    day_column_label : str, optional
        Day column in the dataframe, by default "DAY"
    hour_column_label : str, optional
        Hour column in the dataframe, by default "HOUR"
    latitude_column_label : str, optional
        Latitude column in the dataframe, by default "LATITUDE"
    longitude_column_label : str, optional
        Longitude column in the dataframe, by default "LONGITUDE"
    depth_column_label : str, optional
        Depth column in the dataframe, by default "DEPH"
    variables_reference : list[BaseVar] | None, optional
        List of variables to use as reference. If a variable label is a column
        name, this variable will be used for the output storer,
        by default None
    category : str, optional
        Category of the loaded file, by default "in_situ"
    unit_row_index : int, optional
        Index of the row with the units, None if there's no unit row,
        by default 1
    delim_whitespace : bool, optional
        Whether to use whitespace as delimiters, by default True

    Returns
    -------
    Storer
        Storer read from the file, or the sum of the storers read from every
        file when a list of paths is given.

    Raises
    ------
    TypeError
        If filepath argument is not an instance of Path, string or list.

    Examples
    --------
    Loading from a single file:
    >>> filepath = "path/to/file"
    >>> storer = read_files(
    ...     filepath,
    ...     providers_column_label="providers_column_name",
    ... )

    Loading from multiple files:
    >>> filepaths = [
    ...     "path/to/file1",
    ...     "path/to/file2",
    ... ]
    >>> storer = read_files(
    ...     filepaths,
    ... )
    """
    # Every option except the path is forwarded untouched, both to the
    # recursive call (list input) and to the Reader (single file input).
    reading_options = {
        "providers_column_label": providers_column_label,
        "expocode_column_label": expocode_column_label,
        "date_column_label": date_column_label,
        "year_column_label": year_column_label,
        "month_column_label": month_column_label,
        "day_column_label": day_column_label,
        "hour_column_label": hour_column_label,
        "latitude_column_label": latitude_column_label,
        "longitude_column_label": longitude_column_label,
        "depth_column_label": depth_column_label,
        "variables_reference": variables_reference,
        "category": category,
        "unit_row_index": unit_row_index,
        "delim_whitespace": delim_whitespace,
    }
    if isinstance(filepath, list):
        storers = [read_files(filepath=path, **reading_options) for path in filepath]
        # NOTE: sum() starts from the integer 0, so this relies on the storers
        # supporting `0 + storer`, and an empty list yields 0 rather than a
        # Storer.
        return sum(storers)
    if not isinstance(filepath, (Path, str)):
        error_msg = (
            f"Can't read filepaths from {filepath}. Accepted types are Path or str."
        )
        raise TypeError(error_msg)
    path = filepath if isinstance(filepath, Path) else Path(filepath)
    reader = Reader(filepath=path, **reading_options)
    return reader.get_storer()