Coverage for tests/test_classifier.py: 99%
184 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-07 11:26 +0000
1import os
2import threading
3import unittest
5import numpy as np
6import pandas as pd
7import sklearn.tree as scikit
8from scipy.io import arff
9from sklearn import metrics
10from sklearn.datasets import load_iris
11from sklearn.preprocessing import LabelEncoder
13from rulekit import classification
14from rulekit.events import RuleInductionProgressListener
15from rulekit.params import Measures
16from rulekit.rules import ClassificationRule
17from tests.utils import assert_accuracy_is_greater
18from tests.utils import assert_rules_are_equals
19from tests.utils import dir_path
20from tests.utils import get_test_cases
class TestClassifier(unittest.TestCase):
    """Tests for ``rulekit.classification.RuleClassifier``."""

    def test_classification_accuracy_on_iris(self):
        """RuleKit accuracy on iris should be close to a scikit decision tree."""
        scikit_clf = scikit.DecisionTreeClassifier()
        rulekit_clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        scikit_clf.fit(x, y)
        rulekit_clf.fit(x, y)
        scikit_prediction = scikit_clf.predict(x)
        rulekit_prediction = rulekit_clf.predict(x)
        scikit_accuracy = metrics.accuracy_score(y, scikit_prediction)
        rulekit_accuracy = metrics.accuracy_score(y, rulekit_prediction)

        assert (
            abs(scikit_accuracy - rulekit_accuracy) < 0.04
        ), "RuleKit model should perform similar to scikit model"

    def test_induction_progress_listener(self):
        """Listener callbacks should fire once per induced rule."""
        rulekit_clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        class EventListener(RuleInductionProgressListener):
            # Callbacks may be invoked from a non-main thread, hence the lock.
            lock = threading.Lock()
            induced_rules_count = 0
            on_progress_calls_count = 0

            def on_new_rule(self, rule: ClassificationRule):
                with self.lock:
                    self.induced_rules_count += 1

            def on_progress(
                self, total_examples_count: int, uncovered_examples_count: int
            ):
                with self.lock:
                    self.on_progress_calls_count += 1

        listener = EventListener()
        rulekit_clf.add_event_listener(listener)
        rulekit_clf.fit(x, y)
        rules_count = len(rulekit_clf.model.rules)
        self.assertEqual(rules_count, listener.induced_rules_count)
        self.assertEqual(rules_count, listener.on_progress_calls_count)

    def test_getting_examples_coverage(self):
        """Coverage matrix shape must be (n_examples, n_rules)."""
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)

        coverage_matrix = clf.get_coverage_matrix(x)
        num_rows, num_cols = coverage_matrix.shape

        self.assertEqual(
            num_rows,
            len(x),
            "Coverage matrix should have as many rows as examples in dataset",
        )
        self.assertEqual(
            num_cols,
            len(clf.model.rules),
            "Coverage matrix should have as many cols as rules in ruleset",
        )

    def test_classification_metrics(self):
        """predict(return_metrics=True) should yield prediction metrics."""
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)
        y_pred, m = clf.predict(x, return_metrics=True)
        self.assertEqual(len(y_pred), len(y))
        self.assertIsNotNone(
            m["rules_per_example"], "rules_per_example should be calculated"
        )
        # Fixed copy-paste bug: message previously referred to rules_per_example.
        self.assertIsNotNone(
            m["voting_conflicts"], "voting_conflicts should be calculated"
        )

    def test_score(self):
        """score() must agree with sklearn's accuracy_score."""
        clf = classification.RuleClassifier()
        X, y = load_iris(return_X_y=True)

        clf.fit(X, y)
        rulekit_acc: float = clf.score(X, y)
        sklearn_acc: float = metrics.accuracy_score(y, clf.predict(X))

        self.assertAlmostEqual(
            rulekit_acc, sklearn_acc, places=3, msg="Accuracy should be the same"
        )

    def test_fit_on_integer_labels(self):
        """Prediction dtype should follow the training label dtype."""
        clf1 = classification.RuleClassifier()
        clf2 = classification.RuleClassifier()
        X, y_num = load_iris(return_X_y=True)
        y_str: np.ndarray = y_num.astype(str)

        clf1.fit(X, y_num)
        clf2.fit(X, y_str)

        self.assertTrue(
            isinstance(clf1.predict(X)[0], float),
            "Predictions should be of the same type as labels in the training set",
        )
        self.assertTrue(
            isinstance(clf2.predict(X)[0], str),
            "Predictions should be of the same type as labels in the training set",
        )

    def test_classification_predict_proba(self):
        """Each row of the confidence matrix should sum to 1."""
        clf = classification.RuleClassifier()
        x, y = load_iris(return_X_y=True)

        clf.fit(x, y)
        confidence_matrix, m = clf.predict_proba(x, return_metrics=True)
        for row in confidence_matrix:
            # Use the builtin sum directly instead of shadowing it with a local.
            self.assertAlmostEqual(
                sum(row), 1, 3, "Confidence matrix rows should sum to 1"
            )

    def test_prediction_results_mapping(self):
        """
        This method tests classification on numeric labels. RuleKit under the hood
        maps all label values to integer values starting from 0 to N (counting by
        order of appearance in dataset). Those mapped values must be later remapped
        back to the actual label value. This test verifies that the predict method
        returns the correct (remapped) label value.

        NOTE(review): the original docstring claimed labels "do not start from 0",
        but the dataset below uses 0.0/1.0 — confirm whether non-zero-based labels
        should also be exercised here.
        """
        clf = classification.RuleClassifier()

        # trivial dataset - AND gate (0.0 = false, 1.0 = true)
        x = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
        y = np.array([0.0, 1.0, 0.0, 0.0])
        clf.fit(x, y)
        prediction = clf.predict(x)

        # Fixed: the original `assertEqual(y.all(), prediction.all())` compared
        # two scalar truth values and could not fail on element mismatches.
        self.assertTrue(np.array_equal(y, prediction))

    def test_prediction_on_nominal_values(self):
        """Nominal (string) labels should round-trip through predict."""
        clf = classification.RuleClassifier()

        # some trivial dataset - AND Gate
        x = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
        y = np.array(["false", "true", "false", "false"])
        clf.fit(x, y)
        prediction = clf.predict(x)

        self.assertTrue(np.array_equal(y, prediction))

    def test_compare_with_java_results(self):
        """Induced rulesets must match the reference Java implementation."""
        test_cases = get_test_cases("ClassificationSnCTest")

        for test_case in test_cases:
            params = test_case.induction_params
            tree = classification.RuleClassifier(**params)
            example_set = test_case.example_set
            tree.fit(example_set.values, example_set.labels)
            model = tree.model
            expected = test_case.reference_report.rules
            actual = list(map(str, model.rules))
            assert_rules_are_equals(expected, actual)
            assert_accuracy_is_greater(
                tree.predict(example_set.values), example_set.labels, 0.9
            )

    def test_predict_proba(self):
        """predict_proba output shape and value range checks."""
        test_case = get_test_cases("ClassificationSnCTest")[0]
        params = test_case.induction_params
        clf = classification.RuleClassifier(**params)
        example_set = test_case.example_set
        clf.fit(
            example_set.values,
            example_set.labels,
        )
        res = clf.predict_proba(example_set.values)
        self.assertEqual(
            res.shape[0],
            example_set.values.shape[0],
            "Should have as many rows as the original dataset",
        )
        self.assertEqual(
            res.shape[1],
            np.unique(example_set.labels).shape[0],
            "Should have as many columns as there are classes in the dataset",
        )
        self.assertTrue(
            res.max() <= 1 and res.min() >= 0,
            "Predicted probabilities should be in range [0, 1]",
        )

    def test_fit_and_predict_on_boolean_columns(self):
        """Boolean feature columns and boolean labels must be supported."""
        test_case = get_test_cases("ClassificationSnCTest")[0]
        params = test_case.induction_params
        clf = classification.RuleClassifier(**params)
        X, y = test_case.example_set.values, test_case.example_set.labels
        X["boolean_column"] = np.random.randint(low=0, high=2, size=X.shape[0]).astype(
            bool
        )
        clf.fit(X, y)
        clf.predict(X)

        y = y.astype(bool)
        clf.fit(X, y)
        clf.predict(X)

        y = pd.Series(y)
        clf.fit(X, y)
        clf.predict(X)
class TestExperClassifier(unittest.TestCase):
    """Tests for ``rulekit.classification.ExpertRuleClassifier``."""

    def test_compare_with_java_results(self):
        """Expert rulesets must match the reference Java implementation."""
        for case in get_test_cases("ClassificationExpertSnCTest"):
            classifier = classification.ExpertRuleClassifier(
                **case.induction_params
            )
            dataset = case.example_set
            classifier.fit(
                dataset.values,
                dataset.labels,
                expert_rules=case.knowledge.expert_rules,
                expert_preferred_conditions=case.knowledge.expert_preferred_conditions,
                expert_forbidden_conditions=case.knowledge.expert_forbidden_conditions,
            )
            produced = [str(rule) for rule in classifier.model.rules]
            assert_rules_are_equals(case.reference_report.rules, produced)

    def test_predict_proba(self):
        """predict_proba output shape and value range checks (expert variant)."""
        case = get_test_cases("ClassificationExpertSnCTest")[0]
        classifier = classification.ExpertRuleClassifier(**case.induction_params)
        dataset = case.example_set
        classifier.fit(
            dataset.values,
            dataset.labels,
            expert_rules=case.knowledge.expert_rules,
            expert_preferred_conditions=case.knowledge.expert_preferred_conditions,
            expert_forbidden_conditions=case.knowledge.expert_forbidden_conditions,
        )
        probabilities = classifier.predict_proba(dataset.values)
        n_rows, n_cols = probabilities.shape
        self.assertEqual(
            n_rows,
            dataset.values.shape[0],
            "Should have as many rows as the original dataset",
        )
        self.assertEqual(
            n_cols,
            np.unique(dataset.labels).shape[0],
            "Should have as many columns as there are classes in the dataset",
        )
        self.assertTrue(
            probabilities.max() <= 1 and probabilities.min() >= 0,
            "Predicted probabilities should be in range [0, 1]",
        )

    # Issue #17
    def test_left_open_intervals_in_expert_induction(self):
        """Left-open interval syntax in expert rules must be accepted by fit()."""
        raw, _ = arff.loadarff(
            f"{dir_path}/resources/data/seismic-bumps-train-minimal.arff"
        ), None
        frame = pd.DataFrame(raw[0])
        features = frame.drop("class", axis=1)
        target = frame["class"]

        rules = [
            ("rule-0", "IF [[gimpuls = <-inf, 750)]] THEN class = {0}"),
            ("rule-1", "IF [[gimpuls = (750, inf)]] THEN class = {1}"),
        ]
        preferred = [
            ("preferred-condition-0", "1: IF [[seismic = {a}]] THEN class = {0}"),
            ("preferred-attribute-0", "1: IF [[gimpuls = Any]] THEN class = {1}"),
        ]
        forbidden = [
            ("forb-attribute-0", "1: IF [[seismoacoustic = Any]] THEN class = {0}"),
            ("forb-attribute-1", "inf: IF [[ghazard = Any]] THEN class = {1}"),
        ]
        classifier = classification.ExpertRuleClassifier(
            minsupp_new=8,
            max_growing=0,
            extend_using_preferred=True,
            extend_using_automatic=True,
            induce_using_preferred=True,
            induce_using_automatic=True,
        )
        # Passing left-open intervals must not raise.
        classifier.fit(
            features,
            target,
            expert_rules=rules,
            expert_preferred_conditions=preferred,
            expert_forbidden_conditions=forbidden,
        )

    # @unittest.skip(
    #     "Skipped due to known bug https://bitbucket.org/polsl-a/rules/issues/126"
    # )
    def test_refining_conditions_for_nominal_attributes(self):
        """Refinable (@=) nominal conditions should be resolved during induction."""
        frame: pd.DataFrame = pd.read_csv(
            os.path.join(dir_path, "additional_resources", "salary.csv")
        )
        features, target = frame.drop("Salary", axis=1), frame["Salary"]

        # Run experiment using python API
        classifier = classification.ExpertRuleClassifier(
            induction_measure=Measures.C2,
            pruning_measure=Measures.C2,
            voting_measure=Measures.C2,
            complementary_conditions=True,
            extend_using_preferred=False,
            extend_using_automatic=False,
            induce_using_preferred=False,
            induce_using_automatic=False,
            preferred_conditions_per_rule=0,
            preferred_attributes_per_rule=0,
            consider_other_classes=False,
        )
        fixed_value_rule = (
            "expert_rules-1",
            (
                "IF Age = (-inf, 33.0) AND Job Title @= {Software Engineer} THEN "
                "Salary = {below average}"
            ),
        )
        classifier.fit(features, target, expert_rules=[fixed_value_rule])
        self.assertEqual(
            [
                (
                    "IF [[Age = (-inf, 33)]] AND [[Job Title = {Software Engineer}]] THEN "
                    "Salary = {below average}"
                )
            ],
            [str(r) for r in classifier.model.rules],
            (
                "Ruleset should contain only a single rule configured by expert with "
                "a refined condition"
            ),
        )
        any_value_rule = (
            "expert_rules-1",
            "IF Age = (-inf, 33.0) AND Job Title @= Any THEN Salary = {below average}",
        )
        classifier.fit(features, target, expert_rules=[any_value_rule])
        self.assertEqual(
            [
                (
                    "IF [[Age = (-inf, 33)]] AND [[Job Title = {Marketing Analyst}]] "
                    "THEN Salary = {below average}"
                )
            ],
            [str(r) for r in classifier.model.rules],
            (
                "Ruleset should contain only a single rule configured by expert with "
                "a refined condition"
            ),
        )
# Allow running this test module directly (python tests/test_classifier.py).
if __name__ == "__main__":
    unittest.main()