Coverage for rulekit/arff.py: 87%
23 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-07 11:26 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-07 11:26 +0000
1"""Contains helper functions for reading .arff files.
2"""
3import io
4from typing import Union
6import numpy as np
7import pandas as pd
8import requests
9from scipy.io import arff
12def _is_path_and_url(path: str) -> bool:
13 """Checks if the path is a http or https URL.
14 """
15 return path.startswith("http://") or path.startswith("https://")
def _make_file_object_from_url(url: str) -> io.IOBase:
    """Downloads ``url`` and wraps the response text in a file-like object.

    Args:
        url (str): http or https URL of the resource to download.

    Returns:
        io.IOBase: in-memory text stream (``io.StringIO``) holding the
            decoded response body.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=10)
    # Fail loudly on an error status instead of handing the server's error
    # page to the .arff parser, which would only report a confusing
    # parse failure.
    response.raise_for_status()
    return io.StringIO(response.text)
def read_arff(
    file_path_or_file: Union[str, io.IOBase],
    encoding: str = "utf-8"
) -> pd.DataFrame:
    """Reads an .arff file and returns a pandas DataFrame.

    This function offers a more convenient interface for reading .arff
    files than the scipy.io ``arff.loadarff`` function. It also fixes
    several problems with it, such as handling missing "?" values and
    decoding the byte-string values of nominal columns into ``str``.

    Args:
        file_path_or_file (Union[str, io.IOBase]): Either a path to the
            .arff file or a readable file-like object. The path can also
            be a http or https URL, in which case the file is downloaded
            first.
        encoding (str, optional): Encoding used to decode byte-string
            values found in object-dtype columns. Defaults to "utf-8".

    Returns:
        pd.DataFrame: pandas DataFrame with the data. Null entries and
            "?" markers are replaced with ``None``.

    Example:

        >>> # read from file path
        >>> df: pd.DataFrame = read_arff('./cholesterol.arff')
        >>>
        >>> # read from file-like object
        >>> with open('./cholesterol.arff', 'r') as f:
        ...     df: pd.DataFrame = read_arff(f)
        >>>
        >>> # read from URL
        >>> df: pd.DataFrame = read_arff(
        ...     'https://raw.githubusercontent.com' +
        ...     '/adaa-polsl/RuleKit-python/master/tests' +
        ...     '/additional_resources/cholesterol.arff'
        ... )
    """
    if (
        isinstance(file_path_or_file, str) and
        _is_path_and_url(file_path_or_file)
    ):
        file_path_or_file = _make_file_object_from_url(file_path_or_file)

    df = pd.DataFrame(arff.loadarff(file_path_or_file)[0])
    # Object-dtype columns hold byte strings; decode them to ``str`` one
    # column at a time. This avoids a stack()/unstack() round trip, which
    # reshapes the whole sub-frame just to reach the ``.str`` accessor.
    object_columns = df.select_dtypes([np.object_])
    if not object_columns.empty:
        for col in object_columns.columns:
            df[col] = df[col].str.decode(encoding)
    # Normalise every flavour of "missing" (NaN/NaT and the literal "?"
    # marker used by the ARFF format) to None.
    df = df.where(pd.notnull(df), None)
    df = df.replace({'?': None})
    return df