1"""Module contaiing classes for classification analysis and prediction.
2"""
3from __future__ import annotations
5from enum import Enum
6from numbers import Number
7from typing import Tuple
8from typing import TypedDict
9from typing import Union
11import numpy as np
12import pandas as pd
13from jpype import JClass
14from jpype import JObject
15from sklearn import metrics
17from rulekit._helpers import PredictionResultMapper
18from rulekit._operator import BaseOperator
19from rulekit._operator import Data
20from rulekit._operator import ExpertKnowledgeOperator
21from rulekit._problem_types import ProblemType
22from rulekit.params import ContrastSetModelParams
23from rulekit.params import DEFAULT_PARAMS_VALUE
24from rulekit.params import ExpertModelParams
25from rulekit.params import Measures
26from rulekit.params import ModelsParams
27from rulekit.rules import ClassificationRule
28from rulekit.rules import RuleSet


class ClassificationPredictionMetrics(TypedDict):
    """Stores additional metrics for classification prediction.

    Fields:
        * rules_per_example (float): Average number of rules per example.
        * voting_conflicts (float): Number of voting conflicts.
    """

    rules_per_example: float
    voting_conflicts: float
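
# Illustrative shape of the metrics dictionary returned alongside predictions
# (the numbers below are made up):
#
#     {"rules_per_example": 3.2, "voting_conflicts": 17.0}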


class _ClassificationParams(ModelsParams):
    control_apriori_precision: bool = DEFAULT_PARAMS_VALUE["control_apriori_precision"]
    approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"]
    approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"]


class _ClassificationExpertParams(_ClassificationParams, ExpertModelParams):
    pass


class BaseClassifier:
    """:meta private:"""

    def __init__(self):
        self._ClassificationRulesPerformance: JClass = (
            None  # pylint: disable=invalid-name
        )
        self._NegativeVotingConflictsPerformance: JClass = (
            None  # pylint: disable=invalid-name
        )
        self._init_classification_rule_performance_classes()

    class MetricTypes(Enum):
        """:meta private:"""

        RulesPerExample = 1  # pylint: disable=invalid-name
        VotingConflicts = 2  # pylint: disable=invalid-name
        NegativeVotingConflicts = 3  # pylint: disable=invalid-name

    def _init_classification_rule_performance_classes(self):
        self._ClassificationRulesPerformance = JClass(  # pylint: disable=invalid-name
            "adaa.analytics.rules.logic.performance.ClassificationRulesPerformance"
        )

    def _calculate_metric(
        self, example_set: JObject, metric_type: MetricTypes
    ) -> float:
        metric: JObject = self._ClassificationRulesPerformance(metric_type.value)
        metric_value = float(metric.countExample(example_set).getValue())
        return metric_value

    def _calculate_prediction_metrics(
        self, example_set
    ) -> ClassificationPredictionMetrics:
        return ClassificationPredictionMetrics(
            rules_per_example=self._calculate_metric(
                example_set, BaseClassifier.MetricTypes.RulesPerExample
            ),
            voting_conflicts=self._calculate_metric(
                example_set, BaseClassifier.MetricTypes.VotingConflicts
            ),
        )


class RuleClassifier(BaseOperator, BaseClassifier):
    """Classification model."""

    __params_class__ = _ClassificationParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
    ):
123 """
124 Parameters
125 ----------
126 minsupp_new : float = 5.0
127 a minimum number (or fraction, if value < 1.0) of previously uncovered
128 examples to be covered by a new rule (positive examples for classification
129 problems); default: 5,
130 induction_measure : :class:`rulekit.params.Measures` = :class:`rulekit.params.\
131 Measures.Correlation`
132 measure used during induction; default measure is correlation
133 pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
134 :class:`rulekit.params.Measures.Correlation`
135 measure used during pruning. Could be user defined (string), for example
136 :code:`2 * p / n`; default measure is correlation
137 voting_measure : :class:`rulekit.params.Measures` = \
138 :class:`rulekit.params.Measures.Correlation`
139 measure used during voting; default measure is correlation
140 max_growing : int = 0.0
141 non-negative integer representing maximum number of conditions which can be
142 added to the rule in the growing phase (use this parameter for large
143 datasets if execution time is prohibitive); 0 indicates no limit; default: 0
144 enable_pruning : bool = True
145 enable or disable pruning, default is True.
146 ignore_missing : bool = False
147 boolean telling whether missing values should be ignored (by default, a
148 missing valueof given attribute is always cconsidered as not fulfilling the
149 condition build upon that attribute); default: False.
150 max_uncovered_fraction : float = 0.0
151 Floating-point number from [0,1] interval representing maximum fraction of
152 examples that may remain uncovered by the rule set, default: 0.0.
153 select_best_candidate : bool = False
154 Flag determining if best candidate should be selected from growing phase;
155 default: False.
156 complementary_conditions : bool = False
157 If enabled, complementary conditions in the form a = !{value} for nominal
158 attributes are supported.
159 control_apriori_precision : bool = True
160 When inducing classification rules, verify if candidate precision is higher
161 than apriori precision of the investigated class.
162 max_rule_count : int = 0
163 Maximum number of rules to be generated (for classification data sets it
164 applies to a single class); 0 indicates no limit.
165 approximate_induction: bool = False
166 Use an approximate induction heuristic which does not check all possible
167 splits; note: this is an experimental feature and currently works only for
168 classification data sets, results may change in future;
169 approximate_bins_count: int = 100
170 maximum number of bins for an attribute evaluated in the approximate
171 induction.
172 """
        BaseOperator.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        BaseClassifier.__init__(self)
        self._remap_to_numeric = False
        self.label_unique_values = []
        self.model: RuleSet[ClassificationRule] = None

    def _map_result(self, predicted_example_set) -> np.ndarray:
        prediction: np.ndarray
        if self._remap_to_numeric:
            prediction = PredictionResultMapper.map_to_numerical(predicted_example_set)
        else:
            prediction = PredictionResultMapper.map_to_nominal(predicted_example_set)
        return prediction

    def _map_confidence(self, predicted_example_set) -> np.ndarray:
        return PredictionResultMapper.map_confidence(
            predicted_example_set, self.label_unique_values
        )

    def _get_unique_label_values(self, labels: Data):
        # a dict is used to deduplicate label values while preserving their
        # order of appearance
        tmp = {}
        for label_value in labels:
            tmp[label_value] = None
        self.label_unique_values = list(tmp.keys())
        # byte labels (e.g. numpy arrays of dtype "S") are decoded to str
        if len(self.label_unique_values) > 0 and isinstance(
            self.label_unique_values[0], bytes
        ):
            self.label_unique_values = [
                item.decode("utf-8") for item in self.label_unique_values
            ]

    def _prepare_labels(self, labels: Data) -> Data:
        # boolean and numeric labels are converted to strings for the Java
        # backend; numeric labels are remapped back to numbers on prediction
        if isinstance(labels, (pd.DataFrame, pd.Series)):
            if labels.dtypes.name == "bool":
                return labels.astype(str)
            if isinstance(labels.iloc[0], Number):
                self._remap_to_numeric = True
                return labels.astype(str)
        else:
            if isinstance(labels[0], bool) or (
                isinstance(labels, np.ndarray) and labels.dtype.name == "bool"
            ):
                return np.array(list(map(str, labels)))
            if isinstance(labels[0], Number):
                self._remap_to_numeric = True
                return np.array(list(map(str, labels)))
        return labels

    def fit(
        self, values: Data, labels: Data
    ) -> RuleClassifier:  # pylint: disable=arguments-differ
        """Train model on the given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels

        Returns
        -------
        self : RuleClassifier
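
        Examples
        --------
        A minimal sketch, assuming the backing JVM has already been started
        (e.g. via :code:`RuleKit.init()`) and that ``X`` is a DataFrame of
        attributes with ``y`` holding the class labels::

            >>> clf = RuleClassifier(minsupp_new=5.0)
            >>> clf.fit(X, y)  # doctest: +SKIP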
251 """
252 self._get_unique_label_values(labels)
253 labels = self._prepare_labels(labels)
254 BaseOperator.fit(self, values, labels)
255 return self

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return predicted labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        return_metrics : bool = False
            Optional flag. If set to *True*, the method will calculate some
            additional model metrics and return a tuple instead of just the
            predicted labels.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If the *return_metrics* flag is not set, only the prediction is
            returned; otherwise a tuple is returned whose first element is the
            prediction and whose second element is the metrics.
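
        Examples
        --------
        A sketch of both call styles, assuming a fitted classifier ``clf`` and
        attribute data ``X``::

            >>> y_pred = clf.predict(X)  # doctest: +SKIP
            >>> y_pred, extra = clf.predict(X, return_metrics=True)  # doctest: +SKIP
            >>> extra["rules_per_example"]  # doctest: +SKIP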
278 """
279 result_example_set = BaseOperator.predict(self, values)
280 y_pred = self._map_result(result_example_set)
281 if return_metrics:
282 metrics_values: dict = BaseClassifier._calculate_prediction_metrics(
283 self, result_example_set
284 )
285 return (y_pred, metrics_values)
286 return y_pred

    def predict_proba(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return class probabilities for each example.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        return_metrics : bool = False
            Optional flag. If set to *True*, the method will calculate some
            additional model metrics and return a tuple instead of just the
            probabilities.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If the *return_metrics* flag is not set, only the probability matrix
            is returned; otherwise a tuple is returned whose first element is the
            probability matrix and whose second element is the metrics.
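
        Examples
        --------
        A sketch, assuming a fitted classifier ``clf``; each column of the
        returned matrix should correspond to one entry of
        ``clf.label_unique_values``::

            >>> proba = clf.predict_proba(X)  # doctest: +SKIP
            >>> proba.shape  # (n_examples, n_classes)  # doctest: +SKIP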
309 """
310 result_example_set = BaseOperator.predict(self, values)
311 mapped_result_example_set = self._map_confidence(result_example_set)
312 if return_metrics:
313 metrics_values: dict = BaseClassifier._calculate_prediction_metrics(
314 self, result_example_set
315 )
316 return (mapped_result_example_set, metrics_values)
317 return mapped_result_example_set

    def score(self, values: Data, labels: Data) -> float:
        """Return the accuracy on the given test data and labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true labels

        Returns
        -------
        score : float
            Accuracy of self.predict(values) w.r.t. labels.
        """
        predicted_labels = self.predict(values)
        return metrics.accuracy_score(labels, predicted_labels)

    def __getstate__(self) -> dict:
        return {
            **BaseOperator.__getstate__(self),
            **{
                "label_unique_values": self.label_unique_values,
                "_remap_to_numeric": self._remap_to_numeric,
            },
        }

    def __setstate__(self, state: dict):
        BaseOperator.__setstate__(self, state)
        self._init_classification_rule_performance_classes()
        self.label_unique_values = state["label_unique_values"]
        self._remap_to_numeric = state["_remap_to_numeric"]

    def _get_problem_type(self) -> ProblemType:
        return ProblemType.CLASSIFICATION
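

# RuleClassifier implements __getstate__/__setstate__, so fitted models can be
# pickled. A minimal round-trip sketch (assumes a fitted classifier ``clf``;
# the file name is made up, and the JVM must be running again before
# unpickling, since the Java helper classes are re-created in __setstate__):
#
#     import pickle
#
#     with open("classifier.pkl", "wb") as f:
#         pickle.dump(clf, f)
#     with open("classifier.pkl", "rb") as f:
#         clf = pickle.load(f)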


class ExpertRuleClassifier(ExpertKnowledgeOperator, RuleClassifier):
    """Classification model using expert knowledge."""

    __params_class__ = _ClassificationExpertParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
        extend_using_preferred: bool = DEFAULT_PARAMS_VALUE["extend_using_preferred"],
        extend_using_automatic: bool = DEFAULT_PARAMS_VALUE["extend_using_automatic"],
        induce_using_preferred: bool = DEFAULT_PARAMS_VALUE["induce_using_preferred"],
        induce_using_automatic: bool = DEFAULT_PARAMS_VALUE["induce_using_automatic"],
        consider_other_classes: bool = DEFAULT_PARAMS_VALUE["consider_other_classes"],
        preferred_conditions_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_conditions_per_rule"
        ],
        preferred_attributes_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_attributes_per_rule"
        ],
    ):
393 """
394 Parameters
395 ----------
396 minsupp_new : float = 5.0
397 a minimum number (or fraction, if value < 1.0) of previously uncovered examples
398 to be covered by a new rule (positive examples for classification problems);
399 default: 5,
401 induction_measure : :class:`rulekit.params.Measures` = \
402 :class:`rulekit.params.Measures.Correlation`
403 measure used during induction; default measure is correlation
404 pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
405 :class:`rulekit.params.Measures.Correlation`
406 measure used during pruning. Could be user defined (string), for example
407 :code:`2 * p / n`; default measure is correlation
408 voting_measure : :class:`rulekit.params.Measures` = \
409 :class:`rulekit.params.Measures.Correlation`
410 measure used during voting; default measure is correlation
411 max_growing : int = 0.0
412 non-negative integer representing maximum number of conditions which can be
413 added to the rule in the growing phase (use this parameter for large
414 datasets if execution time is prohibitive); 0 indicates no limit; default: 0
415 enable_pruning : bool = True
416 enable or disable pruning, default is True.
417 ignore_missing : bool = False
418 boolean telling whether missing values should be ignored (by default, a
419 missing value of given attribute is always considered as not fulfilling the
420 condition build upon that attribute); default: False.
421 max_uncovered_fraction : float = 0.0
422 Floating-point number from [0,1] interval representing maximum fraction of
423 examples that may remain uncovered by the rule set, default: 0.0.
424 select_best_candidate : bool = False
425 Flag determining if best candidate should be selected from growing phase;
426 default: False.
427 complementary_conditions : bool = False
428 If enabled, complementary conditions in the form a = !{value} for nominal
429 attributes
430 are supported.
431 control_apriori_precision : bool = True
432 When inducing classification rules, verify if candidate precision is higher
433 than apriori precision of the investigated class.
434 max_rule_count : int = 0
435 Maximum number of rules to be generated (for classification data sets it
436 applies to a single class); 0 indicates no limit.
437 approximate_induction: bool = False
438 Use an approximate induction heuristic which does not check all possible
439 splits; note: this is an experimental feature and currently works only for
440 classification data sets, results may change in future;
441 approximate_bins_count: int = 100
442 maximum number of bins for an attribute evaluated in the approximate
443 induction.
445 extend_using_preferred : bool = False
446 boolean indicating whether initial rules should be extended with a use of
447 preferred conditions and attributes; default is False
448 extend_using_automatic : bool = False
449 boolean indicating whether initial rules should be extended with a use of
450 automatic conditions and attributes; default is False
451 induce_using_preferred : bool = False
452 boolean indicating whether new rules should be induced with a use of
453 preferred conditions and attributes; default is False
454 induce_using_automatic : bool = False
455 boolean indicating whether new rules should be induced with a use of
456 automatic conditions and attributes; default is False
457 consider_other_classes : bool = False
458 boolean indicating whether automatic induction should be performed for
459 classes for which no user's knowledge has been defined
460 (classification only); default is False.
461 preferred_conditions_per_rule : int = None
462 maximum number of preferred conditions per rule; default: unlimited,
463 preferred_attributes_per_rule : int = None
464 maximum number of preferred attributes per rule; default: unlimited.
465 """
        self._remap_to_numeric = False
        RuleClassifier.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        ExpertKnowledgeOperator.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            extend_using_preferred=extend_using_preferred,
            extend_using_automatic=extend_using_automatic,
            induce_using_preferred=induce_using_preferred,
            induce_using_automatic=induce_using_automatic,
            consider_other_classes=consider_other_classes,
            preferred_conditions_per_rule=preferred_conditions_per_rule,
            preferred_attributes_per_rule=preferred_attributes_per_rule,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        self.model: RuleSet[ClassificationRule] = None

    def fit(  # pylint: disable=arguments-differ,too-many-arguments
        self,
        values: Data,
        labels: Data,
        expert_rules: list[Union[str, tuple[str, str]]] = None,
        expert_preferred_conditions: list[Union[str, tuple[str, str]]] = None,
        expert_forbidden_conditions: list[Union[str, tuple[str, str]]] = None,
    ) -> ExpertRuleClassifier:
        """Train model on the given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels
        expert_rules : List[Union[str, Tuple[str, str]]]
            set of initial rules, either passed as a list of strings representing
            rules or as a list of tuples where the first element is the name of the
            rule and the second one is the rule string.
        expert_preferred_conditions : List[Union[str, Tuple[str, str]]]
            multiset of preferred conditions (used also for specifying preferred
            attributes by using the special value Any). Either passed as a list of
            strings representing rules or as a list of tuples where the first element
            is the name of the rule and the second one is the rule string.
        expert_forbidden_conditions : List[Union[str, Tuple[str, str]]]
            set of forbidden conditions (used also for specifying forbidden
            attributes by using the special value Any). Either passed as a list of
            strings representing rules or as a list of tuples where the first element
            is the name of the rule and the second one is the rule string.

        Returns
        -------
        self : ExpertRuleClassifier
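
        Examples
        --------
        A sketch of passing expert knowledge, assuming a started JVM and
        training data ``X`` / ``y``; the attribute name ``attr1`` and the class
        value ``1`` are made up, and the exact rule grammar is defined by
        RuleKit's expert knowledge syntax::

            >>> clf = ExpertRuleClassifier(induce_using_automatic=True)
            >>> clf.fit(
            ...     X,
            ...     y,
            ...     expert_rules=[
            ...         ("rule-0", "IF [[attr1 = (-inf, 10)]] THEN class = {1}")
            ...     ],
            ... )  # doctest: +SKIP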
544 """
545 if isinstance(labels, (pd.DataFrame, pd.Series)):
546 if isinstance(labels.iloc[0], Number):
547 self._remap_to_numeric = True
548 labels = labels.astype(str)
549 else:
550 if isinstance(labels[0], Number):
551 self._remap_to_numeric = True
552 labels = list(map(str, labels))
553 self._get_unique_label_values(labels)
554 self._prepare_labels(labels)
555 return ExpertKnowledgeOperator.fit(
556 self,
557 values,
558 labels,
559 expert_rules=expert_rules,
560 expert_preferred_conditions=expert_preferred_conditions,
561 expert_forbidden_conditions=expert_forbidden_conditions,
562 )

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        return RuleClassifier.predict(self, values, return_metrics)

    def __getstate__(self) -> dict:
        return {
            **BaseOperator.__getstate__(self),
            **{"_remap_to_numeric": self._remap_to_numeric},
        }

    def __setstate__(self, state: dict):
        BaseOperator.__setstate__(self, state)
        self._remap_to_numeric = state["_remap_to_numeric"]

    def _get_problem_type(self) -> ProblemType:
        return ProblemType.CLASSIFICATION


class ContrastSetRuleClassifier(BaseOperator, BaseClassifier):
    """Contrast set classification model."""

    __params_class__ = ContrastSetModelParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_all: Tuple[float, float, float, float] = DEFAULT_PARAMS_VALUE[
            "minsupp_all"
        ],
        max_neg2pos: float = DEFAULT_PARAMS_VALUE["max_neg2pos"],
        max_passes_count: int = DEFAULT_PARAMS_VALUE["max_passes_count"],
        penalty_strength: float = DEFAULT_PARAMS_VALUE["penalty_strength"],
        penalty_saturation: float = DEFAULT_PARAMS_VALUE["penalty_saturation"],
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
    ):
616 """
617 Parameters
618 ----------
619 minsupp_all: Tuple[float, float, float, float]
620 a minimum positive support of a contrast set (p/P). When multiple values
621 are specified, a metainduction is performed; Default and recommended
622 sequence is: 0.8, 0.5, 0.2, 0.1
623 max_neg2pos: float
624 a maximum ratio of negative to positive supports (nP/pN); Default is 0.5
625 max_passes_count: int
626 a maximum number of sequential covering passes for a single minsupp-all;
627 Default is 5
628 penalty_strength: float
629 (s) - penalty strength; Default is 0.5
630 penalty_saturation: float
631 the value of p_new / P at which penalty reward saturates; Default is 0.2.
632 minsupp_new : float = 5.0
633 a minimum number (or fraction, if value < 1.0) of previously uncovered
634 examples to be covered by a new rule (positive examples for classification
635 problems); default: 5,
636 induction_measure : :class:`rulekit.params.Measures` = \
637 :class:`rulekit.params.Measures.Correlation`
638 measure used during induction; default measure is correlation
639 pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
640 :class:`rulekit.params.Measures.Correlation`
641 measure used during pruning. Could be user defined (string), for example
642 :code:`2 * p / n`; default measure is correlation
643 voting_measure : :class:`rulekit.params.Measures` = \
644 :class:`rulekit.params.Measures.Correlation`
645 measure used during voting; default measure is correlation
646 max_growing : int = 0.0
647 non-negative integer representing maximum number of conditions which can be
648 added to the rule in the growing phase (use this parameter for large
649 datasets if execution time is prohibitive); 0 indicates no limit; default: 0
650 enable_pruning : bool = True
651 enable or disable pruning, default is True.
652 ignore_missing : bool = False
653 boolean telling whether missing values should be ignored (by default, a
654 missing value of given attribute is always considered as not fulfilling the
655 condition build upon that attribute); default: False.
656 max_uncovered_fraction : float = 0.0
657 Floating-point number from [0,1] interval representing maximum fraction of
658 examples that may remain uncovered by the rule set, default: 0.0.
659 select_best_candidate : bool = False
660 Flag determining if best candidate should be selected from growing phase;
661 default: False.
662 complementary_conditions : bool = False
663 If enabled, complementary conditions in the form a = !{value} for nominal
664 attributes are supported.
665 control_apriori_precision : bool = True
666 When inducing classification rules, verify if candidate precision is higher
667 than apriori precision of the investigated class.
668 max_rule_count : int = 0
669 Maximum number of rules to be generated (for classification data sets it
670 applies to a single class); 0 indicates no limit.
671 approximate_induction: bool = False
672 Use an approximate induction heuristic which does not check all possible
673 splits; note: this is an experimental feature and currently works only for
674 classification data sets, results may change in future;
675 approximate_bins_count: int = 100
676 maximum number of bins for an attribute evaluated in the approximate
677 induction.
678 """
        BaseOperator.__init__(
            self,
            minsupp_all=minsupp_all,
            max_neg2pos=max_neg2pos,
            max_passes_count=max_passes_count,
            penalty_strength=penalty_strength,
            penalty_saturation=penalty_saturation,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        BaseClassifier.__init__(self)
        self.contrast_attribute: str = None
        self._remap_to_numeric = False
        self.label_unique_values = []
        self.model: RuleSet[ClassificationRule] = None

    def _map_result(self, predicted_example_set) -> np.ndarray:
        prediction: np.ndarray
        if self._remap_to_numeric:
            prediction = PredictionResultMapper.map_to_numerical(predicted_example_set)
        else:
            prediction = PredictionResultMapper.map_to_nominal(predicted_example_set)
        return prediction

    def _get_unique_label_values(self, labels: Data):
        # a dict is used to deduplicate label values while preserving their
        # order of appearance
        tmp = {}
        for label_value in labels:
            tmp[label_value] = None
        self.label_unique_values = list(tmp.keys())
        if len(self.label_unique_values) > 0 and isinstance(
            self.label_unique_values[0], bytes
        ):
            self.label_unique_values = [
                item.decode("utf-8") for item in self.label_unique_values
            ]

    def fit(
        self, values: Data, labels: Data, contrast_attribute: str
    ) -> ContrastSetRuleClassifier:  # pylint: disable=arguments-differ
        """Train model on the given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels
        contrast_attribute : str
            group attribute

        Returns
        -------
        self : ContrastSetRuleClassifier
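
        Examples
        --------
        A sketch, assuming a started JVM and a DataFrame ``df`` whose ``group``
        column (a made-up name) identifies the contrast groups::

            >>> X, y = df.drop("class", axis=1), df["class"]  # doctest: +SKIP
            >>> clf = ContrastSetRuleClassifier()
            >>> clf.fit(X, y, contrast_attribute="group")  # doctest: +SKIP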
743 """
744 RuleClassifier._get_unique_label_values( # pylint: disable=protected-access
745 self, labels
746 )
747 RuleClassifier._prepare_labels( # pylint: disable=protected-access,protected-access
748 self, labels
749 )
750 BaseOperator.fit(self, values, labels, contrast_attribute=contrast_attribute)
751 self.contrast_attribute = contrast_attribute
752 return self

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return predicted labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        return_metrics : bool = False
            Optional flag. If set to *True*, the method will calculate some
            additional model metrics and return a tuple instead of just the
            predicted labels.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If the *return_metrics* flag is not set, only the prediction is
            returned; otherwise a tuple is returned whose first element is the
            prediction and whose second element is the metrics.
        """
        return RuleClassifier.predict(self, values, return_metrics)

    def predict_proba(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return class probabilities for each example.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        return_metrics : bool = False
            Optional flag. If set to *True*, the method will calculate some
            additional model metrics and return a tuple instead of just the
            probabilities.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If the *return_metrics* flag is not set, only the probability matrix
            is returned; otherwise a tuple is returned whose first element is the
            probability matrix and whose second element is the metrics.
        """
        return RuleClassifier.predict_proba(self, values, return_metrics)

    def score(self, values: Data, labels: Data) -> float:
        """Return the accuracy on the given test data and labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true labels

        Returns
        -------
        score : float
            Accuracy of self.predict(values) w.r.t. labels.
        """
        return RuleClassifier.score(self, values, labels)

    def __getstate__(self) -> dict:
        return {
            **BaseOperator.__getstate__(self),
            **{
                "label_unique_values": self.label_unique_values,
                "_remap_to_numeric": self._remap_to_numeric,
                "contrast_attribute": self.contrast_attribute,
            },
        }

    def __setstate__(self, state: dict):
        BaseOperator.__setstate__(self, state)
        self._init_classification_rule_performance_classes()
        self.label_unique_values = state["label_unique_values"]
        self._remap_to_numeric = state["_remap_to_numeric"]
        self.contrast_attribute = state["contrast_attribute"]

    def _get_problem_type(self) -> ProblemType:
        return ProblemType.CONTRAST_CLASSIFICATION