Coverage for rulekit/arff.py: 87%

23 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-07 11:26 +0000

1"""Contains helper functions for reading .arff files.

2""" 

3import io 

4from typing import Union 

5 

6import numpy as np 

7import pandas as pd 

8import requests 

9from scipy.io import arff 

10 

11 

def _is_path_and_url(path: str) -> bool:
    """Tells whether the given path string is a http or https URL.

    Args:
        path (str): path or URL to examine.

    Returns:
        bool: True when the string begins with "http://" or "https://".
    """
    url_prefixes = ("http://", "https://")
    return path.startswith(url_prefixes)

16 

17 

def _make_file_object_from_url(url: str) -> io.IOBase:
    """Makes a file-like object from a http or https URL.

    The resource is downloaded with a 10 second timeout and its decoded
    text is wrapped in an in-memory text stream.

    Args:
        url (str): http or https URL of the resource to download.

    Returns:
        io.IOBase: ``io.StringIO`` holding the text of the response body.

    Raises:
        requests.HTTPError: if the server responds with an error
            (4xx or 5xx) status code.
    """
    response = requests.get(url, timeout=10)
    # Fail early on HTTP errors instead of handing an error page to the
    # caller, where it would only surface later as a confusing parse error.
    response.raise_for_status()
    return io.StringIO(response.text)

23 

24 

def read_arff(
    file_path_or_file: Union[str, io.IOBase],
    encoding: str = "utf-8"
) -> pd.DataFrame:
    """Reads an .arff file and returns a pandas DataFrame.

    This function offers a more convenient interface to read .arff files
    than the scipy.io.arff.loadarff function. It also fixes multiple
    problems with it, such as handling missing "?" values and decoding
    nominal columns.

    Args:
        file_path_or_file (Union[str, io.IOBase]): Either path to the .arff
            file or a readable file-like object. The path can also be
            a http or https URL.
        encoding (str, optional): Encoding used to decode the values of
            object-dtype (byte string) columns. Defaults to "utf-8".

    Returns:
        pd.DataFrame: pandas DataFrame with the data; "?" entries are
            replaced with None.

    Example:

        >>> # read from file path
        >>> df: pd.DataFrame = read_arff('./cholesterol.arff')
        >>>
        >>> # read from file-like object
        >>> with open('./cholesterol.arff', 'r') as f:
        ...     df: pd.DataFrame = read_arff(f)
        >>>
        >>> # read from URL
        >>> df: pd.DataFrame = read_arff(
        ...     'https://raw.githubusercontent.com' +
        ...     '/adaa-polsl/RuleKit-python/master/tests' +
        ...     '/additional_resources/cholesterol.arff'
        ... )
    """
    if (
        isinstance(file_path_or_file, str) and
        _is_path_and_url(file_path_or_file)
    ):
        file_path_or_file = _make_file_object_from_url(file_path_or_file)

    df = pd.DataFrame(arff.loadarff(file_path_or_file)[0])
    # Object columns are expected to hold byte strings; decode them column
    # by column, which avoids reshaping the whole frame with
    # stack()/unstack() just to apply an element-wise operation.
    object_columns: pd.DataFrame = df.select_dtypes([np.object_])
    if not object_columns.empty:
        for col in object_columns.columns:
            df[col] = object_columns[col].str.decode(encoding)
    # Normalise missing markers: null values first, then the ARFF "?" token.
    df = df.where(pd.notnull(df), None)
    df = df.replace({'?': None})
    return df