Coverage for tests/test_classifier.py: 99%

184 statements  

coverage.py v7.6.10, created at 2025-01-07 11:26 +0000

import os
import threading
import unittest

import numpy as np
import pandas as pd
import sklearn.tree as scikit
from scipy.io import arff
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder

from rulekit import classification
from rulekit.events import RuleInductionProgressListener
from rulekit.params import Measures
from rulekit.rules import ClassificationRule
from tests.utils import assert_accuracy_is_greater
from tests.utils import assert_rules_are_equals
from tests.utils import dir_path
from tests.utils import get_test_cases


class TestClassifier(unittest.TestCase):

    def test_classification_accuracy_on_iris(self):
        scikit_clf = scikit.DecisionTreeClassifier()
        rulekit_clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        scikit_clf.fit(x, y)
        rulekit_clf.fit(x, y)
        scikit_prediction = scikit_clf.predict(x)
        rulekit_prediction = rulekit_clf.predict(x)
        scikit_accuracy = metrics.accuracy_score(y, scikit_prediction)
        rulekit_accuracy = metrics.accuracy_score(y, rulekit_prediction)

        assert (
            abs(scikit_accuracy - rulekit_accuracy) < 0.04
        ), "RuleKit model should perform similarly to the scikit-learn model"

    def test_induction_progress_listener(self):
        rulekit_clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        class EventListener(RuleInductionProgressListener):
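            # A lock guards the counters in case the listener callbacks are
            # invoked concurrently during rule induction.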

            lock = threading.Lock()
            induced_rules_count = 0
            on_progress_calls_count = 0

            def on_new_rule(self, rule: ClassificationRule):
                self.lock.acquire()
                self.induced_rules_count += 1
                self.lock.release()

            def on_progress(
                self, total_examples_count: int, uncovered_examples_count: int
            ):
                self.lock.acquire()
                self.on_progress_calls_count += 1
                self.lock.release()

        listener = EventListener()
        rulekit_clf.add_event_listener(listener)
        rulekit_clf.fit(x, y)
        rules_count = len(rulekit_clf.model.rules)
        self.assertEqual(rules_count, listener.induced_rules_count)
        self.assertEqual(rules_count, listener.on_progress_calls_count)

    def test_getting_examples_coverage(self):
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)

        coverage_matrix = clf.get_coverage_matrix(x)
        num_rows, num_cols = coverage_matrix.shape

        self.assertEqual(
            num_rows,
            len(x),
            "Coverage matrix should have as many rows as examples in dataset",
        )
        self.assertEqual(
            num_cols,
            len(clf.model.rules),
            "Coverage matrix should have as many cols as rules in ruleset",
        )

    def test_classification_metrics(self):
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)
        y_pred, m = clf.predict(x, return_metrics=True)
        self.assertEqual(len(y_pred), len(y))
        self.assertIsNotNone(
            m["rules_per_example"], "rules_per_example should be calculated"
        )
        self.assertIsNotNone(
            m["voting_conflicts"], "voting_conflicts should be calculated"
        )

    def test_score(self):
        clf = classification.RuleClassifier()
        X, y = load_iris(return_X_y=True)

        clf.fit(X, y)
        rulekit_acc: float = clf.score(X, y)
        sklearn_acc: float = metrics.accuracy_score(y, clf.predict(X))

        self.assertAlmostEqual(
            rulekit_acc, sklearn_acc, places=3, msg="Accuracy should be the same"
        )

    def test_fit_on_integer_labels(self):
        clf1 = classification.RuleClassifier()
        clf2 = classification.RuleClassifier()
        X, y_num = load_iris(return_X_y=True)
        y_str: np.ndarray = y_num.astype(str)

        clf1.fit(X, y_num)
        clf2.fit(X, y_str)

        self.assertTrue(
            isinstance(clf1.predict(X)[0], float),
            "Predictions should be of the same type as labels in the training set",
        )
        self.assertTrue(
            isinstance(clf2.predict(X)[0], str),
            "Predictions should be of the same type as labels in the training set",
        )

    def test_classification_predict_proba(self):
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)
        confidence_matrix, m = clf.predict_proba(x, return_metrics=True)
        for row in confidence_matrix:
            row_sum = 0
            for col in row:
                row_sum += col
            self.assertAlmostEqual(
                row_sum, 1, 3, "Confidence matrix rows should sum to 1"
            )

    def test_prediction_results_mapping(self):
        """
        This method tests classification on numeric labels whose possible values
        do not start from 0. Under the hood, RuleKit maps all label values to
        integers from 0 to N (in order of appearance in the dataset). Those
        mapped values must later be remapped back to the actual label values.
        This test verifies that the predict method returns the correct
        (remapped) label values.
        """

        clf = classification.RuleClassifier()

        # a trivial dataset - AND gate (0.0 = false, 1.0 = true)
        x = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
        y = np.array([0.0, 1.0, 0.0, 0.0])
        clf.fit(x, y)
        prediction = clf.predict(x)

        self.assertTrue(np.array_equal(y, prediction))

    def test_prediction_on_nominal_values(self):
        clf = classification.RuleClassifier()

        # a trivial dataset - AND gate
        x = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
        y = np.array(["false", "true", "false", "false"])
        clf.fit(x, y)
        prediction = clf.predict(x)

        self.assertTrue(np.array_equal(y, prediction))

    def test_compare_with_java_results(self):
        test_cases = get_test_cases("ClassificationSnCTest")

        for test_case in test_cases:
            params = test_case.induction_params
            tree = classification.RuleClassifier(**params)
            example_set = test_case.example_set
            tree.fit(example_set.values, example_set.labels)
            model = tree.model
            expected = test_case.reference_report.rules
            actual = list(map(str, model.rules))
            assert_rules_are_equals(expected, actual)
            assert_accuracy_is_greater(
                tree.predict(example_set.values), example_set.labels, 0.9
            )

    def test_predict_proba(self):
        test_case = get_test_cases("ClassificationSnCTest")[0]
        params = test_case.induction_params
        clf = classification.RuleClassifier(**params)
        example_set = test_case.example_set
        clf.fit(
            example_set.values,
            example_set.labels,
        )
        res = clf.predict_proba(example_set.values)
        self.assertEqual(
            res.shape[0],
            example_set.values.shape[0],
            "Should have as many rows as the original dataset",
        )
        self.assertEqual(
            res.shape[1],
            np.unique(example_set.labels).shape[0],
            "Should have as many columns as there are classes in the dataset",
        )
        self.assertTrue(
            res.max() <= 1 and res.min() >= 0,
            "Predicted probabilities should be in range [0, 1]",
        )

    def test_fit_and_predict_on_boolean_columns(self):
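        # The classifier should accept a boolean feature column, boolean labels,
        # and labels given as a pandas Series without raising errors.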

        test_case = get_test_cases("ClassificationSnCTest")[0]
        params = test_case.induction_params
        clf = classification.RuleClassifier(**params)
        X, y = test_case.example_set.values, test_case.example_set.labels
        X["boolean_column"] = np.random.randint(low=0, high=2, size=X.shape[0]).astype(
            bool
        )
        clf.fit(X, y)
        clf.predict(X)

        y = y.astype(bool)
        clf.fit(X, y)
        clf.predict(X)

        y = pd.Series(y)
        clf.fit(X, y)
        clf.predict(X)


class TestExpertClassifier(unittest.TestCase):

    def test_compare_with_java_results(self):
        test_cases = get_test_cases("ClassificationExpertSnCTest")

        for test_case in test_cases:
            params = test_case.induction_params
            clf = classification.ExpertRuleClassifier(**params)
            example_set = test_case.example_set
            clf.fit(
                example_set.values,
                example_set.labels,
                expert_rules=test_case.knowledge.expert_rules,
                expert_preferred_conditions=test_case.knowledge.expert_preferred_conditions,
                expert_forbidden_conditions=test_case.knowledge.expert_forbidden_conditions,
            )
            model = clf.model
            expected = test_case.reference_report.rules
            actual = list(map(str, model.rules))
            assert_rules_are_equals(expected, actual)

    def test_predict_proba(self):
        test_case = get_test_cases("ClassificationExpertSnCTest")[0]
        params = test_case.induction_params
        clf = classification.ExpertRuleClassifier(**params)
        example_set = test_case.example_set
        clf.fit(
            example_set.values,
            example_set.labels,
            expert_rules=test_case.knowledge.expert_rules,
            expert_preferred_conditions=test_case.knowledge.expert_preferred_conditions,
            expert_forbidden_conditions=test_case.knowledge.expert_forbidden_conditions,
        )
        res = clf.predict_proba(example_set.values)
        self.assertEqual(
            res.shape[0],
            example_set.values.shape[0],
            "Should have as many rows as the original dataset",
        )
        self.assertEqual(
            res.shape[1],
            np.unique(example_set.labels).shape[0],
            "Should have as many columns as there are classes in the dataset",
        )
        self.assertTrue(
            res.max() <= 1 and res.min() >= 0,
            "Predicted probabilities should be in range [0, 1]",
        )

    # Issue #17
    def test_left_open_intervals_in_expert_induction(self):
        df = pd.DataFrame(
            arff.loadarff(
                f"{dir_path}/resources/data/seismic-bumps-train-minimal.arff"
            )[0]
        )
        X = df.drop("class", axis=1)
        y = df["class"]
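        # Note the left-open interval (750, inf) in rule-1 below; this is the
        # scenario from issue #17 referenced above.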

        expert_rules = [
            ("rule-0", "IF [[gimpuls = <-inf, 750)]] THEN class = {0}"),
            ("rule-1", "IF [[gimpuls = (750, inf)]] THEN class = {1}"),
        ]

        expert_preferred_conditions = [
            ("preferred-condition-0", "1: IF [[seismic = {a}]] THEN class = {0}"),
            ("preferred-attribute-0", "1: IF [[gimpuls = Any]] THEN class = {1}"),
        ]

        expert_forbidden_conditions = [
            ("forb-attribute-0", "1: IF [[seismoacoustic = Any]] THEN class = {0}"),
            ("forb-attribute-1", "inf: IF [[ghazard = Any]] THEN class = {1}"),
        ]
        clf = classification.ExpertRuleClassifier(
            minsupp_new=8,
            max_growing=0,
            extend_using_preferred=True,
            extend_using_automatic=True,
            induce_using_preferred=True,
            induce_using_automatic=True,
        )
        clf.fit(
            X,
            y,
            expert_rules=expert_rules,
            expert_preferred_conditions=expert_preferred_conditions,
            expert_forbidden_conditions=expert_forbidden_conditions,
        )

    # @unittest.skip(
    #     "Skipped due to known bug https://bitbucket.org/polsl-a/rules/issues/126"
    # )
    def test_refining_conditions_for_nominal_attributes(self):
        df: pd.DataFrame = pd.read_csv(
            os.path.join(dir_path, "additional_resources", "salary.csv")
        )
        X, y = df.drop("Salary", axis=1), df["Salary"]

        # Run the experiment using the Python API
        clf = classification.ExpertRuleClassifier(
            induction_measure=Measures.C2,
            pruning_measure=Measures.C2,
            voting_measure=Measures.C2,
            complementary_conditions=True,
            extend_using_preferred=False,
            extend_using_automatic=False,
            induce_using_preferred=False,
            induce_using_automatic=False,
            preferred_conditions_per_rule=0,
            preferred_attributes_per_rule=0,
            consider_other_classes=False,
        )
        clf.fit(
            X,
            y,
            expert_rules=[
                (
                    "expert_rules-1",
                    (
                        "IF Age = (-inf, 33.0) AND Job Title @= {Software Engineer} THEN "
                        "Salary = {below average}"
                    ),
                )
            ],
        )
        self.assertEqual(
            [
                (
                    "IF [[Age = (-inf, 33)]] AND [[Job Title = {Software Engineer}]] THEN "
                    "Salary = {below average}"
                )
            ],
            [str(r) for r in clf.model.rules],
            (
                "Ruleset should contain only a single rule configured by expert with "
                "a refined condition"
            ),
        )
        clf.fit(
            X,
            y,
            expert_rules=[
                (
                    "expert_rules-1",
                    "IF Age = (-inf, 33.0) AND Job Title @= Any THEN Salary = {below average}",
                )
            ],
        )
        self.assertEqual(
            [
                (
                    "IF [[Age = (-inf, 33)]] AND [[Job Title = {Marketing Analyst}]] "
                    "THEN Salary = {below average}"
                )
            ],
            [str(r) for r in clf.model.rules],
            (
                "Ruleset should contain only a single rule configured by expert with "
                "a refined condition"
            ),
        )


if __name__ == "__main__":
    unittest.main()