Coverage for tests/test_classifier.py: 99%

184 statements  

coverage.py v7.6.10, created at 2025-01-07 11:26 +0000

import os
import threading
import unittest

import numpy as np
import pandas as pd
import sklearn.tree as scikit
from scipy.io import arff
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder

from rulekit import classification
from rulekit.events import RuleInductionProgressListener
from rulekit.params import Measures
from rulekit.rules import ClassificationRule
from tests.utils import assert_accuracy_is_greater
from tests.utils import assert_rules_are_equals
from tests.utils import dir_path
from tests.utils import get_test_cases


class TestClassifier(unittest.TestCase):

    def test_classification_accuracy_on_iris(self):
        scikit_clf = scikit.DecisionTreeClassifier()
        rulekit_clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        scikit_clf.fit(x, y)
        rulekit_clf.fit(x, y)
        scikit_prediction = scikit_clf.predict(x)
        rulekit_prediction = rulekit_clf.predict(x)
        scikit_accuracy = metrics.accuracy_score(y, scikit_prediction)
        rulekit_accuracy = metrics.accuracy_score(y, rulekit_prediction)

        assert (
            abs(scikit_accuracy - rulekit_accuracy) < 0.04
        ), "RuleKit model should perform similarly to the scikit-learn model"

    def test_induction_progress_listener(self):
        rulekit_clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        class EventListener(RuleInductionProgressListener):
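            # A lock guards the counters in case the listener callbacks are
            # invoked concurrently during rule induction.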

            lock = threading.Lock()
            induced_rules_count = 0
            on_progress_calls_count = 0

            def on_new_rule(self, rule: ClassificationRule):
                self.lock.acquire()
                self.induced_rules_count += 1
                self.lock.release()

            def on_progress(
                self, total_examples_count: int, uncovered_examples_count: int
            ):
                self.lock.acquire()
                self.on_progress_calls_count += 1
                self.lock.release()

        listener = EventListener()
        rulekit_clf.add_event_listener(listener)
        rulekit_clf.fit(x, y)
        rules_count = len(rulekit_clf.model.rules)
        self.assertEqual(rules_count, listener.induced_rules_count)
        self.assertEqual(rules_count, listener.on_progress_calls_count)

    def test_getting_examples_coverage(self):
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)

        coverage_matrix = clf.get_coverage_matrix(x)
        num_rows, num_cols = coverage_matrix.shape

        self.assertEqual(
            num_rows,
            len(x),
            "Coverage matrix should have as many rows as examples in dataset",
        )
        self.assertEqual(
            num_cols,
            len(clf.model.rules),
            "Coverage matrix should have as many cols as rules in ruleset",
        )

    def test_classification_metrics(self):
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)
        y_pred, m = clf.predict(x, return_metrics=True)
        self.assertEqual(len(y_pred), len(y))
        self.assertIsNotNone(
            m["rules_per_example"], "rules_per_example should be calculated"
        )
        self.assertIsNotNone(
            m["voting_conflicts"], "voting_conflicts should be calculated"
        )

    def test_score(self):
        clf = classification.RuleClassifier()
        X, y = load_iris(return_X_y=True)

        clf.fit(X, y)
        rulekit_acc: float = clf.score(X, y)
        sklearn_acc: float = metrics.accuracy_score(y, clf.predict(X))

        self.assertAlmostEqual(
            rulekit_acc, sklearn_acc, places=3, msg="Accuracy should be the same"
        )

    def test_fit_on_integer_labels(self):
        clf1 = classification.RuleClassifier()
        clf2 = classification.RuleClassifier()
        X, y_num = load_iris(return_X_y=True)
        y_str: np.ndarray = y_num.astype(str)

        clf1.fit(X, y_num)
        clf2.fit(X, y_str)

        self.assertTrue(
            isinstance(clf1.predict(X)[0], float),
            "Predictions should be of the same type as labels in the training set",
        )
        self.assertTrue(
            isinstance(clf2.predict(X)[0], str),
            "Predictions should be of the same type as labels in the training set",
        )

    def test_classification_predict_proba(self):
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)
        confidence_matrix, m = clf.predict_proba(x, return_metrics=True)
        for row in confidence_matrix:
            row_sum = 0
            for col in row:
                row_sum += col
            self.assertAlmostEqual(
                row_sum, 1, 3, "Confidence matrix rows should sum to 1"
            )

    def test_prediction_results_mapping(self):
        """
        This method tests classification on numeric labels whose possible values
        do not start from 0. Under the hood, RuleKit maps all label values to
        integers from 0 to N (in order of appearance in the dataset). Those
        mapped values must later be remapped back to the actual label values.
        This test verifies that the predict method returns the correct
        (remapped) label values.
        """

        clf = classification.RuleClassifier()

        # a trivial dataset - AND gate (0.0 = false, 1.0 = true)
        x = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
        y = np.array([0.0, 1.0, 0.0, 0.0])
        clf.fit(x, y)
        prediction = clf.predict(x)

        self.assertTrue(np.array_equal(y, prediction))

    def test_prediction_on_nominal_values(self):
        clf = classification.RuleClassifier()

        # a trivial dataset - AND gate
        x = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
        y = np.array(["false", "true", "false", "false"])
        clf.fit(x, y)
        prediction = clf.predict(x)

        self.assertTrue(np.array_equal(y, prediction))

    def test_compare_with_java_results(self):
        test_cases = get_test_cases("ClassificationSnCTest")

        for test_case in test_cases:
            params = test_case.induction_params
            tree = classification.RuleClassifier(**params)
            example_set = test_case.example_set
            tree.fit(example_set.values, example_set.labels)
            model = tree.model
            expected = test_case.reference_report.rules
            actual = list(map(str, model.rules))
            assert_rules_are_equals(expected, actual)
            assert_accuracy_is_greater(
                tree.predict(example_set.values), example_set.labels, 0.9
            )

    def test_predict_proba(self):
        test_case = get_test_cases("ClassificationSnCTest")[0]
        params = test_case.induction_params
        clf = classification.RuleClassifier(**params)
        example_set = test_case.example_set
        clf.fit(
            example_set.values,
            example_set.labels,
        )
        res = clf.predict_proba(example_set.values)
        self.assertEqual(
            res.shape[0],
            example_set.values.shape[0],
            "Should have as many rows as the original dataset",
        )
        self.assertEqual(
            res.shape[1],
            np.unique(example_set.labels).shape[0],
            "Should have as many columns as there are classes in the dataset",
        )
        self.assertTrue(
            res.max() <= 1 and res.min() >= 0,
            "Predicted probabilities should be in range [0, 1]",
        )

    def test_fit_and_predict_on_boolean_columns(self):
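        # The classifier should accept a boolean feature column, boolean labels,
        # and labels given as a pandas Series without raising errors.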

        test_case = get_test_cases("ClassificationSnCTest")[0]
        params = test_case.induction_params
        clf = classification.RuleClassifier(**params)
        X, y = test_case.example_set.values, test_case.example_set.labels
        X["boolean_column"] = np.random.randint(low=0, high=2, size=X.shape[0]).astype(
            bool
        )
        clf.fit(X, y)
        clf.predict(X)

        y = y.astype(bool)
        clf.fit(X, y)
        clf.predict(X)

        y = pd.Series(y)
        clf.fit(X, y)
        clf.predict(X)


class TestExpertClassifier(unittest.TestCase):

    def test_compare_with_java_results(self):
        test_cases = get_test_cases("ClassificationExpertSnCTest")

        for test_case in test_cases:
            params = test_case.induction_params
            clf = classification.ExpertRuleClassifier(**params)
            example_set = test_case.example_set
            clf.fit(
                example_set.values,
                example_set.labels,
                expert_rules=test_case.knowledge.expert_rules,
                expert_preferred_conditions=test_case.knowledge.expert_preferred_conditions,
                expert_forbidden_conditions=test_case.knowledge.expert_forbidden_conditions,
            )
            model = clf.model
            expected = test_case.reference_report.rules
            actual = list(map(str, model.rules))
            assert_rules_are_equals(expected, actual)

    def test_predict_proba(self):
        test_case = get_test_cases("ClassificationExpertSnCTest")[0]
        params = test_case.induction_params
        clf = classification.ExpertRuleClassifier(**params)
        example_set = test_case.example_set
        clf.fit(
            example_set.values,
            example_set.labels,
            expert_rules=test_case.knowledge.expert_rules,
            expert_preferred_conditions=test_case.knowledge.expert_preferred_conditions,
            expert_forbidden_conditions=test_case.knowledge.expert_forbidden_conditions,
        )
        res = clf.predict_proba(example_set.values)
        self.assertEqual(
            res.shape[0],
            example_set.values.shape[0],
            "Should have as many rows as the original dataset",
        )
        self.assertEqual(
            res.shape[1],
            np.unique(example_set.labels).shape[0],
            "Should have as many columns as there are classes in the dataset",
        )
        self.assertTrue(
            res.max() <= 1 and res.min() >= 0,
            "Predicted probabilities should be in range [0, 1]",
        )

    # Issue #17
    def test_left_open_intervals_in_expert_induction(self):
        df = pd.DataFrame(
            arff.loadarff(
                f"{dir_path}/resources/data/seismic-bumps-train-minimal.arff"
            )[0]
        )
        X = df.drop("class", axis=1)
        y = df["class"]
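        # Note the left-open interval (750, inf) in rule-1 below; this is the
        # scenario from issue #17 referenced above.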

        expert_rules = [
            ("rule-0", "IF [[gimpuls = <-inf, 750)]] THEN class = {0}"),
            ("rule-1", "IF [[gimpuls = (750, inf)]] THEN class = {1}"),
        ]

        expert_preferred_conditions = [
            ("preferred-condition-0", "1: IF [[seismic = {a}]] THEN class = {0}"),
            ("preferred-attribute-0", "1: IF [[gimpuls = Any]] THEN class = {1}"),
        ]

        expert_forbidden_conditions = [
            ("forb-attribute-0", "1: IF [[seismoacoustic = Any]] THEN class = {0}"),
            ("forb-attribute-1", "inf: IF [[ghazard = Any]] THEN class = {1}"),
        ]
        clf = classification.ExpertRuleClassifier(
            minsupp_new=8,
            max_growing=0,
            extend_using_preferred=True,
            extend_using_automatic=True,
            induce_using_preferred=True,
            induce_using_automatic=True,
        )
        clf.fit(
            X,
            y,
            expert_rules=expert_rules,
            expert_preferred_conditions=expert_preferred_conditions,
            expert_forbidden_conditions=expert_forbidden_conditions,
        )

    # @unittest.skip(
    #     "Skipped due to known bug https://bitbucket.org/polsl-a/rules/issues/126"
    # )
    def test_refining_conditions_for_nominal_attributes(self):
        df: pd.DataFrame = pd.read_csv(
            os.path.join(dir_path, "additional_resources", "salary.csv")
        )
        X, y = df.drop("Salary", axis=1), df["Salary"]

        # Run the experiment using the Python API
        clf = classification.ExpertRuleClassifier(
            induction_measure=Measures.C2,
            pruning_measure=Measures.C2,
            voting_measure=Measures.C2,
            complementary_conditions=True,
            extend_using_preferred=False,
            extend_using_automatic=False,
            induce_using_preferred=False,
            induce_using_automatic=False,
            preferred_conditions_per_rule=0,
            preferred_attributes_per_rule=0,
            consider_other_classes=False,
        )
        clf.fit(
            X,
            y,
            expert_rules=[
                (
                    "expert_rules-1",
                    (
                        "IF Age = (-inf, 33.0) AND Job Title @= {Software Engineer} THEN "
                        "Salary = {below average}"
                    ),
                )
            ],
        )
        self.assertEqual(
            [
                (
                    "IF [[Age = (-inf, 33)]] AND [[Job Title = {Software Engineer}]] THEN "
                    "Salary = {below average}"
                )
            ],
            [str(r) for r in clf.model.rules],
            (
                "Ruleset should contain only a single rule configured by expert with "
                "a refined condition"
            ),
        )
        clf.fit(
            X,
            y,
            expert_rules=[
                (
                    "expert_rules-1",
                    "IF Age = (-inf, 33.0) AND Job Title @= Any THEN Salary = {below average}",
                )
            ],
        )
        self.assertEqual(
            [
                (
                    "IF [[Age = (-inf, 33)]] AND [[Job Title = {Marketing Analyst}]] "
                    "THEN Salary = {below average}"
                )
            ],
            [str(r) for r in clf.model.rules],
            (
                "Ruleset should contain only a single rule configured by expert with "
                "a refined condition"
            ),
        )


if __name__ == "__main__":
    unittest.main()