Coverage for tests/test_regression.py: 99%

116 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-07 11:26 +0000

1import os 

2import threading 

3import unittest 

4 

5import numpy as np 

6import pandas as pd 

7 

8from rulekit import regression 

9from rulekit.arff import read_arff 

10from rulekit.events import RuleInductionProgressListener 

11from rulekit.main import RuleKit 

12from rulekit.params import Measures 

13from rulekit.rules import RegressionRule 

14from rulekit.rules import RuleSet 

15from tests.utils import assert_rules_are_equals 

16from tests.utils import assert_score_is_greater 

17from tests.utils import dir_path 

18from tests.utils import get_test_cases 

19 

20 

class TestRegressor(unittest.TestCase):
    """Regression tests for ``regression.RuleRegressor``."""

    @classmethod
    def setUpClass(cls):
        """Initialise RuleKit once before any test of this class runs."""
        RuleKit.init()

    def test_induction_progress_listener(self):
        """An event listener can observe rule induction and stop it early.

        The listener asks for a stop once ``MAX_RULES`` rules were reported.
        The fitted model must then hold exactly ``MAX_RULES`` rules and
        ``on_progress`` must have been called once per rule.
        """
        test_case = get_test_cases("RegressionSnCTest")[0]

        reg = regression.RuleRegressor()
        example_set = test_case.example_set
        MAX_RULES = 3

        class EventListener(RuleInductionProgressListener):
            # The counters are guarded by a lock; presumably the callbacks
            # may be invoked from a different thread -- TODO confirm.
            lock = threading.Lock()
            induced_rules_count = 0
            on_progress_calls_count = 0

            def on_new_rule(self, rule: RegressionRule):
                # ``with`` guarantees the lock is released even if the
                # guarded statement raises.
                with self.lock:
                    self.induced_rules_count += 1

            def on_progress(
                self, total_examples_count: int, uncovered_examples_count: int
            ):
                with self.lock:
                    self.on_progress_calls_count += 1

            def should_stop(self) -> bool:
                with self.lock:
                    # ``>=`` rather than ``==``: still request a stop if the
                    # counter moved past the limit between two checks.
                    return self.induced_rules_count >= MAX_RULES

        listener = EventListener()
        reg.add_event_listener(listener)
        reg.fit(example_set.values, example_set.labels)

        rules_count = len(reg.model.rules)
        self.assertEqual(rules_count, MAX_RULES)
        self.assertEqual(rules_count, listener.on_progress_calls_count)

    def test_compare_with_java_results(self):
        """Induced rules must equal the reference report of every test case.

        Predictions on the training data are additionally checked with
        ``assert_score_is_greater`` using a threshold of 0.7.
        """
        test_cases = get_test_cases("RegressionSnCTest")

        for test_case in test_cases:
            params = test_case.induction_params
            reg = regression.RuleRegressor(**params)
            example_set = test_case.example_set
            reg.fit(example_set.values, example_set.labels)
            expected = test_case.reference_report.rules
            actual = [str(rule) for rule in reg.model.rules]
            assert_rules_are_equals(expected, actual)
            assert_score_is_greater(
                reg.predict(example_set.values), example_set.labels, 0.7
            )

    def test_fit_and_predict_on_boolean_columns(self):
        """Fitting and predicting must work when a boolean column is present.

        This is a smoke test: it passes as long as ``fit`` and ``predict``
        do not raise, both for the labels as provided by the test case and
        for the labels wrapped in a ``pd.Series``.
        """
        test_case = get_test_cases("RegressionSnCTest")[0]
        params = test_case.induction_params
        reg = regression.RuleRegressor(**params)
        # Work on a copy so the extra column never leaks into the test-case
        # data (assumes ``values`` is a pandas DataFrame -- the column
        # assignment below already requires that).
        X = test_case.example_set.values.copy()
        y = test_case.example_set.labels
        # Seeded generator: the random column is identical on every run, so
        # a failure can be reproduced.
        rng = np.random.default_rng(0)
        X["boolean_column"] = rng.integers(
            low=0, high=2, size=X.shape[0]
        ).astype(bool)
        reg.fit(X, y)
        reg.predict(X)

        y = pd.Series(y)
        reg.fit(X, y)
        reg.predict(X)

    def test_cholesterol(self):
        """Rules induced on ``cholesterol.arff`` must equal the pinned ones."""
        resources_dir: str = os.path.join(dir_path, "additional_resources")
        df: pd.DataFrame = read_arff(
            os.path.join(resources_dir, "cholesterol.arff")
        )
        X, y = df.drop("class", axis=1), df["class"]

        # Run experiment using python API
        reg = regression.RuleRegressor(
            minsupp_new=0.05,
            max_uncovered_fraction=0.0,
            max_growing=0.0,
            induction_measure=Measures.Accuracy,
            pruning_measure=Measures.Accuracy,
            voting_measure=Measures.Accuracy,
            ignore_missing=False,
            select_best_candidate=False,
            complementary_conditions=True,
            max_rule_count=0,
        )
        reg.fit(X, y)
        actual_rules: list[str] = list(map(str, reg.model.rules))
        expected_rules: list[str] = [
            "IF trestbps = (-inf, 149) THEN class = {244.84} [192.73,296.96]",
            "IF trestbps = <122, inf) THEN class = {250.80} [201.79,299.80]",
        ]
        self.assertEqual(actual_rules, expected_rules)

123 

124 

class TestExpertRegressor(unittest.TestCase):
    """Regression tests for ``regression.ExpertRuleRegressor``."""

    @classmethod
    def setUpClass(cls):
        """Start RuleKit once for the whole test class."""
        RuleKit.init()

    def test_compare_with_java_results(self):
        """Induced rules must equal the reference report of every test case.

        Predictions on the training data are additionally checked with
        ``assert_score_is_greater`` using a threshold of 0.66.
        """
        for case in get_test_cases("RegressionExpertSnCTest"):
            regressor = regression.ExpertRuleRegressor(**case.induction_params)
            data = case.example_set
            regressor.fit(
                data.values,
                data.labels,
                expert_rules=case.knowledge.expert_rules,
                expert_preferred_conditions=(
                    case.knowledge.expert_preferred_conditions
                ),
                expert_forbidden_conditions=(
                    case.knowledge.expert_forbidden_conditions
                ),
            )
            reference_rules = case.reference_report.rules
            induced_rules = list(map(str, regressor.model.rules))
            assert_rules_are_equals(reference_rules, induced_rules)
            predictions = regressor.predict(data.values)
            assert_score_is_greater(predictions, data.labels, 0.66)

    def test_legacy_expert_rules_format(self):
        """The legacy expert-rule format must still be accepted.

        A legacy rule string carries a conclusion of the form
        ``label_attr = {NaN}``; in the new format the conclusion part is
        empty. Fitting with either variant must produce the same rules.
        """
        arff_path = os.path.join(
            dir_path, "additional_resources", "cholesterol.arff"
        )
        df: pd.DataFrame = read_arff(arff_path)
        X = df.drop("class", axis=1)
        y = df["class"]

        # Run experiment using python API
        reg = regression.ExpertRuleRegressor(
            mean_based_regression=True, extend_using_automatic=True
        )
        expert_rule = "IF trestbps < 149 THEN class = {NaN}"

        # First fit: rule given in the legacy format, conclusion included.
        legacy_ruleset: RuleSet[RegressionRule] = reg.fit(
            X, y, expert_rules=[("0", expert_rule)]
        )
        # Second fit: same rule with everything from "class" onwards removed.
        rule_without_conclusion = expert_rule.split("class")[0]
        new_format_ruleset: RuleSet[RegressionRule] = reg.fit(
            X, y, expert_rules=[("rule-0", rule_without_conclusion)]
        )

        legacy_rules: list[str] = [str(r) for r in legacy_ruleset.rules]
        new_format_rules: list[str] = [str(r) for r in new_format_ruleset.rules]
        self.assertEqual(legacy_rules, new_format_rules)

    def test_refining_conditions_for_nominal_attributes(self):
        """Expert conditions on a nominal attribute yield the expected rule.

        With every automatic/preferred induction and extension option
        switched off, both an explicit ``sex @= {1}`` condition and the
        ``sex @= Any`` condition must end up as the same single rule.
        """
        arff_path = os.path.join(
            dir_path, "additional_resources", "cholesterol.arff"
        )
        df: pd.DataFrame = read_arff(arff_path)
        X = df.drop("class", axis=1)
        y = df["class"]

        # Run experiment using python API
        regressor = regression.ExpertRuleRegressor(
            induction_measure=Measures.C2,
            pruning_measure=Measures.C2,
            voting_measure=Measures.C2,
            complementary_conditions=True,
            extend_using_preferred=False,
            extend_using_automatic=False,
            induce_using_preferred=False,
            induce_using_automatic=False,
            preferred_conditions_per_rule=0,
            preferred_attributes_per_rule=0,
        )

        # Explicit value set given by the expert.
        regressor.fit(
            X, y, expert_rules=[("expert_rules-1", "IF sex @= {1} THEN")]
        )
        self.assertEqual(
            ["IF [[sex = {1}]] THEN class = {239.60} [197.06,282.15]"],
            list(map(str, regressor.model.rules)),
            "Ruleset should contain only a single rule configured by expert",
        )

        # "Any" condition given by the expert.
        regressor.fit(
            X, y, expert_rules=[("expert_rules-1", "IF sex @= Any THEN")]
        )
        self.assertEqual(
            ["IF [[sex = {1}]] THEN class = {239.60} [197.06,282.15]"],
            list(map(str, regressor.model.rules)),
            (
                "Ruleset should contain only a single rule configured by expert with "
                "a refined condition"
            ),
        )

217 

218 

# Run every test case defined in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()