1"""Module contaiing classes for classification analysis and prediction.
2"""
3from __future__ import annotations
5from enum import Enum
6from numbers import Number
7from typing import Tuple
8from typing import TypedDict
9from typing import Union
11import numpy as np
12import pandas as pd
13from jpype import JClass
14from jpype import JObject
15from sklearn import metrics
17from rulekit._helpers import PredictionResultMapper
18from rulekit._operator import BaseOperator
19from rulekit._operator import Data
20from rulekit._operator import ExpertKnowledgeOperator
21from rulekit._problem_types import ProblemType
22from rulekit.params import ContrastSetModelParams
23from rulekit.params import DEFAULT_PARAMS_VALUE
24from rulekit.params import ExpertModelParams
25from rulekit.params import Measures
26from rulekit.params import ModelsParams
27from rulekit.rules import ClassificationRule
28from rulekit.rules import RuleSet


class ClassificationPredictionMetrics(TypedDict):
    """Stores additional metrics for classification prediction.

    Fields:
        * rules_per_example (float): Average number of rules per example.
        * voting_conflicts (float): Number of voting conflicts.
    """

    rules_per_example: float
    voting_conflicts: float
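
# Illustrative shape of the metrics dictionary returned alongside predictions
# (the numbers below are made up):
#
#     {"rules_per_example": 3.2, "voting_conflicts": 17.0}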


class _ClassificationParams(ModelsParams):
    control_apriori_precision: bool = DEFAULT_PARAMS_VALUE["control_apriori_precision"]
    approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"]
    approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"]


class _ClassificationExpertParams(_ClassificationParams, ExpertModelParams):
    pass


class BaseClassifier:
    """:meta private:"""

    def __init__(self):
        self._ClassificationRulesPerformance: JClass = (
            None  # pylint: disable=invalid-name
        )
        self._NegativeVotingConflictsPerformance: JClass = (
            None  # pylint: disable=invalid-name
        )
        self._init_classification_rule_performance_classes()

    class MetricTypes(Enum):
        """:meta private:"""

        RulesPerExample = 1  # pylint: disable=invalid-name
        VotingConflicts = 2  # pylint: disable=invalid-name
        NegativeVotingConflicts = 3  # pylint: disable=invalid-name

    def _init_classification_rule_performance_classes(self):
        self._ClassificationRulesPerformance = JClass(  # pylint: disable=invalid-name
            "adaa.analytics.rules.logic.performance.ClassificationRulesPerformance"
        )

    def _calculate_metric(
        self, example_set: JObject, metric_type: MetricTypes
    ) -> float:
        metric: JObject = self._ClassificationRulesPerformance(metric_type.value)
        metric_value = float(metric.countExample(example_set).getValue())
        return metric_value

    def _calculate_prediction_metrics(
        self, example_set
    ) -> ClassificationPredictionMetrics:
        return ClassificationPredictionMetrics(
            rules_per_example=self._calculate_metric(
                example_set, BaseClassifier.MetricTypes.RulesPerExample
            ),
            voting_conflicts=self._calculate_metric(
                example_set, BaseClassifier.MetricTypes.VotingConflicts
            ),
        )


class RuleClassifier(BaseOperator, BaseClassifier):
    """Classification model."""

    __params_class__ = _ClassificationParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
    ):
123 """
124 Parameters
125 ----------
126 minsupp_new : float = 5.0
127 a minimum number (or fraction, if value < 1.0) of previously uncovered
128 examples to be covered by a new rule (positive examples for classification
129 problems); default: 5,
130 induction_measure : :class:`rulekit.params.Measures` = :class:`rulekit.params.\
131 Measures.Correlation`
132 measure used during induction; default measure is correlation
133 pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
134 :class:`rulekit.params.Measures.Correlation`
135 measure used during pruning. Could be user defined (string), for example
136 :code:`2 * p / n`; default measure is correlation
137 voting_measure : :class:`rulekit.params.Measures` = \
138 :class:`rulekit.params.Measures.Correlation`
139 measure used during voting; default measure is correlation
140 max_growing : int = 0.0
141 non-negative integer representing maximum number of conditions which can be
142 added to the rule in the growing phase (use this parameter for large
143 datasets if execution time is prohibitive); 0 indicates no limit; default: 0
144 enable_pruning : bool = True
145 enable or disable pruning, default is True.
146 ignore_missing : bool = False
147 boolean telling whether missing values should be ignored (by default, a
148 missing valueof given attribute is always cconsidered as not fulfilling the
149 condition build upon that attribute); default: False.
150 max_uncovered_fraction : float = 0.0
151 Floating-point number from [0,1] interval representing maximum fraction of
152 examples that may remain uncovered by the rule set, default: 0.0.
153 select_best_candidate : bool = False
154 Flag determining if best candidate should be selected from growing phase;
155 default: False.
156 complementary_conditions : bool = False
157 If enabled, complementary conditions in the form a = !{value} for nominal
158 attributes are supported.
159 control_apriori_precision : bool = True
160 When inducing classification rules, verify if candidate precision is higher
161 than apriori precision of the investigated class.
162 max_rule_count : int = 0
163 Maximum number of rules to be generated (for classification data sets it
164 applies to a single class); 0 indicates no limit.
165 approximate_induction: bool = False
166 Use an approximate induction heuristic which does not check all possible
167 splits; note: this is an experimental feature and currently works only for
168 classification data sets, results may change in future;
169 approximate_bins_count: int = 100
170 maximum number of bins for an attribute evaluated in the approximate
171 induction.
172 """
        BaseOperator.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        BaseClassifier.__init__(self)
        self._remap_to_numeric = False
        self.label_unique_values = []
        self.model: RuleSet[ClassificationRule] = None

    def _map_result(self, predicted_example_set) -> np.ndarray:
        prediction: np.ndarray
        if self._remap_to_numeric:
            prediction = PredictionResultMapper.map_to_numerical(predicted_example_set)
        else:
            prediction = PredictionResultMapper.map_to_nominal(predicted_example_set)
        return prediction

    def _map_confidence(self, predicted_example_set) -> np.ndarray:
        return PredictionResultMapper.map_confidence(
            predicted_example_set, self.label_unique_values
        )

    def _get_unique_label_values(self, labels: Data):
        # a dict is used to deduplicate label values while preserving their
        # order of appearance
        tmp = {}
        for label_value in labels:
            tmp[label_value] = None
        self.label_unique_values = list(tmp.keys())
        # byte labels (e.g. numpy arrays of dtype "S") are decoded to str
        if len(self.label_unique_values) > 0 and isinstance(
            self.label_unique_values[0], bytes
        ):
            self.label_unique_values = [
                item.decode("utf-8") for item in self.label_unique_values
            ]

    def _prepare_labels(self, labels: Data) -> Data:
        # boolean and numeric labels are converted to strings for the Java
        # backend; numeric labels are remapped back to numbers on prediction
        if isinstance(labels, (pd.DataFrame, pd.Series)):
            if labels.dtypes.name == "bool":
                return labels.astype(str)
            if isinstance(labels.iloc[0], Number):
                self._remap_to_numeric = True
                return labels.astype(str)
        else:
            if isinstance(labels[0], bool) or (
                isinstance(labels, np.ndarray) and labels.dtype.name == "bool"
            ):
                return np.array(list(map(str, labels)))
            if isinstance(labels[0], Number):
                self._remap_to_numeric = True
                return np.array(list(map(str, labels)))
        return labels

    def fit(
        self, values: Data, labels: Data
    ) -> RuleClassifier:  # pylint: disable=arguments-differ
        """Train model on the given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels

        Returns
        -------
        self : RuleClassifier
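
        Examples
        --------
        A minimal sketch, assuming the backing JVM has already been started
        (e.g. via :code:`RuleKit.init()`) and that ``X`` is a DataFrame of
        attributes with ``y`` holding the class labels::

            >>> clf = RuleClassifier(minsupp_new=5.0)
            >>> clf.fit(X, y)  # doctest: +SKIP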
251 """
252 self._get_unique_label_values(labels)
253 labels = self._prepare_labels(labels)
254 BaseOperator.fit(self, values, labels)
255 return self

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return predicted labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        return_metrics : bool = False
            Optional flag. If set to *True*, the method will calculate some
            additional model metrics and return a tuple instead of just the
            predicted labels.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If the *return_metrics* flag is not set, only the prediction is
            returned; otherwise a tuple is returned whose first element is the
            prediction and whose second element is the metrics.
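
        Examples
        --------
        A sketch of both call styles, assuming a fitted classifier ``clf`` and
        attribute data ``X``::

            >>> y_pred = clf.predict(X)  # doctest: +SKIP
            >>> y_pred, extra = clf.predict(X, return_metrics=True)  # doctest: +SKIP
            >>> extra["rules_per_example"]  # doctest: +SKIP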
278 """
279 result_example_set = BaseOperator.predict(self, values)
280 y_pred = self._map_result(result_example_set)
281 if return_metrics:
282 metrics_values: dict = BaseClassifier._calculate_prediction_metrics(
283 self, result_example_set
284 )
285 return (y_pred, metrics_values)
286 return y_pred

    def predict_proba(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return class probabilities for each example.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        return_metrics : bool = False
            Optional flag. If set to *True*, the method will calculate some
            additional model metrics and return a tuple instead of just the
            probabilities.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If the *return_metrics* flag is not set, only the probability matrix
            is returned; otherwise a tuple is returned whose first element is the
            probability matrix and whose second element is the metrics.
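
        Examples
        --------
        A sketch, assuming a fitted classifier ``clf``; each column of the
        returned matrix should correspond to one entry of
        ``clf.label_unique_values``::

            >>> proba = clf.predict_proba(X)  # doctest: +SKIP
            >>> proba.shape  # (n_examples, n_classes)  # doctest: +SKIP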
309 """
310 result_example_set = BaseOperator.predict(self, values)
311 mapped_result_example_set = self._map_confidence(result_example_set)
312 if return_metrics:
313 metrics_values: dict = BaseClassifier._calculate_prediction_metrics(
314 self, result_example_set
315 )
316 return (mapped_result_example_set, metrics_values)
317 return mapped_result_example_set

    def score(self, values: Data, labels: Data) -> float:
        """Return the accuracy on the given test data and labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true labels

        Returns
        -------
        score : float
            Accuracy of self.predict(values) w.r.t. labels.
        """
        predicted_labels = self.predict(values)
        return metrics.accuracy_score(labels, predicted_labels)

    def __getstate__(self) -> dict:
        return {
            **BaseOperator.__getstate__(self),
            **{
                "label_unique_values": self.label_unique_values,
                "_remap_to_numeric": self._remap_to_numeric,
            },
        }

    def __setstate__(self, state: dict):
        BaseOperator.__setstate__(self, state)
        self._init_classification_rule_performance_classes()
        self.label_unique_values = state["label_unique_values"]
        self._remap_to_numeric = state["_remap_to_numeric"]

    def _get_problem_type(self) -> ProblemType:
        return ProblemType.CLASSIFICATION
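

# RuleClassifier implements __getstate__/__setstate__, so fitted models can be
# pickled. A minimal round-trip sketch (assumes a fitted classifier ``clf``;
# the file name is made up, and the JVM must be running again before
# unpickling, since the Java helper classes are re-created in __setstate__):
#
#     import pickle
#
#     with open("classifier.pkl", "wb") as f:
#         pickle.dump(clf, f)
#     with open("classifier.pkl", "rb") as f:
#         clf = pickle.load(f)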


class ExpertRuleClassifier(ExpertKnowledgeOperator, RuleClassifier):
    """Classification model using expert knowledge."""

    __params_class__ = _ClassificationExpertParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
        extend_using_preferred: bool = DEFAULT_PARAMS_VALUE["extend_using_preferred"],
        extend_using_automatic: bool = DEFAULT_PARAMS_VALUE["extend_using_automatic"],
        induce_using_preferred: bool = DEFAULT_PARAMS_VALUE["induce_using_preferred"],
        induce_using_automatic: bool = DEFAULT_PARAMS_VALUE["induce_using_automatic"],
        consider_other_classes: bool = DEFAULT_PARAMS_VALUE["consider_other_classes"],
        preferred_conditions_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_conditions_per_rule"
        ],
        preferred_attributes_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_attributes_per_rule"
        ],
    ):
393 """
394 Parameters
395 ----------
396 minsupp_new : float = 5.0
397 a minimum number (or fraction, if value < 1.0) of previously uncovered examples
398 to be covered by a new rule (positive examples for classification problems);
399 default: 5,
401 induction_measure : :class:`rulekit.params.Measures` = \
402 :class:`rulekit.params.Measures.Correlation`
403 measure used during induction; default measure is correlation
404 pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
405 :class:`rulekit.params.Measures.Correlation`
406 measure used during pruning. Could be user defined (string), for example
407 :code:`2 * p / n`; default measure is correlation
408 voting_measure : :class:`rulekit.params.Measures` = \
409 :class:`rulekit.params.Measures.Correlation`
410 measure used during voting; default measure is correlation
411 max_growing : int = 0.0
412 non-negative integer representing maximum number of conditions which can be
413 added to the rule in the growing phase (use this parameter for large
414 datasets if execution time is prohibitive); 0 indicates no limit; default: 0
415 enable_pruning : bool = True
416 enable or disable pruning, default is True.
417 ignore_missing : bool = False
418 boolean telling whether missing values should be ignored (by default, a
419 missing value of given attribute is always considered as not fulfilling the
420 condition build upon that attribute); default: False.
421 max_uncovered_fraction : float = 0.0
422 Floating-point number from [0,1] interval representing maximum fraction of
423 examples that may remain uncovered by the rule set, default: 0.0.
424 select_best_candidate : bool = False
425 Flag determining if best candidate should be selected from growing phase;
426 default: False.
427 complementary_conditions : bool = False
428 If enabled, complementary conditions in the form a = !{value} for nominal
429 attributes
430 are supported.
431 control_apriori_precision : bool = True
432 When inducing classification rules, verify if candidate precision is higher
433 than apriori precision of the investigated class.
434 max_rule_count : int = 0
435 Maximum number of rules to be generated (for classification data sets it
436 applies to a single class); 0 indicates no limit.
437 approximate_induction: bool = False
438 Use an approximate induction heuristic which does not check all possible
439 splits; note: this is an experimental feature and currently works only for
440 classification data sets, results may change in future;
441 approximate_bins_count: int = 100
442 maximum number of bins for an attribute evaluated in the approximate
443 induction.
445 extend_using_preferred : bool = False
446 boolean indicating whether initial rules should be extended with a use of
447 preferred conditions and attributes; default is False
448 extend_using_automatic : bool = False
449 boolean indicating whether initial rules should be extended with a use of
450 automatic conditions and attributes; default is False
451 induce_using_preferred : bool = False
452 boolean indicating whether new rules should be induced with a use of
453 preferred conditions and attributes; default is False
454 induce_using_automatic : bool = False
455 boolean indicating whether new rules should be induced with a use of
456 automatic conditions and attributes; default is False
457 consider_other_classes : bool = False
458 boolean indicating whether automatic induction should be performed for
459 classes for which no user's knowledge has been defined
460 (classification only); default is False.
461 preferred_conditions_per_rule : int = None
462 maximum number of preferred conditions per rule; default: unlimited,
463 preferred_attributes_per_rule : int = None
464 maximum number of preferred attributes per rule; default: unlimited.
465 """
        self._remap_to_numeric = False
        RuleClassifier.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        ExpertKnowledgeOperator.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            extend_using_preferred=extend_using_preferred,
            extend_using_automatic=extend_using_automatic,
            induce_using_preferred=induce_using_preferred,
            induce_using_automatic=induce_using_automatic,
            consider_other_classes=consider_other_classes,
            preferred_conditions_per_rule=preferred_conditions_per_rule,
            preferred_attributes_per_rule=preferred_attributes_per_rule,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        self.model: RuleSet[ClassificationRule] = None

    def fit(  # pylint: disable=arguments-differ,too-many-arguments
        self,
        values: Data,
        labels: Data,
        expert_rules: list[Union[str, tuple[str, str]]] = None,
        expert_preferred_conditions: list[Union[str, tuple[str, str]]] = None,
        expert_forbidden_conditions: list[Union[str, tuple[str, str]]] = None,
    ) -> ExpertRuleClassifier:
        """Train model on the given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels
        expert_rules : List[Union[str, Tuple[str, str]]]
            set of initial rules, either passed as a list of strings representing
            rules or as a list of tuples where the first element is the name of the
            rule and the second one is the rule string.
        expert_preferred_conditions : List[Union[str, Tuple[str, str]]]
            multiset of preferred conditions (used also for specifying preferred
            attributes by using the special value Any). Either passed as a list of
            strings representing rules or as a list of tuples where the first element
            is the name of the rule and the second one is the rule string.
        expert_forbidden_conditions : List[Union[str, Tuple[str, str]]]
            set of forbidden conditions (used also for specifying forbidden
            attributes by using the special value Any). Either passed as a list of
            strings representing rules or as a list of tuples where the first element
            is the name of the rule and the second one is the rule string.

        Returns
        -------
        self : ExpertRuleClassifier
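
        Examples
        --------
        A sketch of passing expert knowledge, assuming a started JVM and
        training data ``X`` / ``y``; the attribute name ``attr1`` and the class
        value ``1`` are made up, and the exact rule grammar is defined by
        RuleKit's expert knowledge syntax::

            >>> clf = ExpertRuleClassifier(induce_using_automatic=True)
            >>> clf.fit(
            ...     X,
            ...     y,
            ...     expert_rules=[
            ...         ("rule-0", "IF [[attr1 = (-inf, 10)]] THEN class = {1}")
            ...     ],
            ... )  # doctest: +SKIP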
544 """
545 if isinstance(labels, (pd.DataFrame, pd.Series)):
546 if isinstance(labels.iloc[0], Number):
547 self._remap_to_numeric = True
548 labels = labels.astype(str)
549 else:
550 if isinstance(labels[0], Number):
551 self._remap_to_numeric = True
552 labels = list(map(str, labels))
553 self._get_unique_label_values(labels)
554 self._prepare_labels(labels)
555 return ExpertKnowledgeOperator.fit(
556 self,
557 values,
558 labels,
559 expert_rules=expert_rules,
560 expert_preferred_conditions=expert_preferred_conditions,
561 expert_forbidden_conditions=expert_forbidden_conditions,
562 )

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        return RuleClassifier.predict(self, values, return_metrics)

    def __getstate__(self) -> dict:
        return {
            **BaseOperator.__getstate__(self),
            **{"_remap_to_numeric": self._remap_to_numeric},
        }

    def __setstate__(self, state: dict):
        BaseOperator.__setstate__(self, state)
        self._remap_to_numeric = state["_remap_to_numeric"]

    def _get_problem_type(self) -> ProblemType:
        return ProblemType.CLASSIFICATION


class ContrastSetRuleClassifier(BaseOperator, BaseClassifier):
    """Contrast set classification model."""

    __params_class__ = ContrastSetModelParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_all: Tuple[float, float, float, float] = DEFAULT_PARAMS_VALUE[
            "minsupp_all"
        ],
        max_neg2pos: float = DEFAULT_PARAMS_VALUE["max_neg2pos"],
        max_passes_count: int = DEFAULT_PARAMS_VALUE["max_passes_count"],
        penalty_strength: float = DEFAULT_PARAMS_VALUE["penalty_strength"],
        penalty_saturation: float = DEFAULT_PARAMS_VALUE["penalty_saturation"],
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
    ):
616 """
617 Parameters
618 ----------
619 minsupp_all: Tuple[float, float, float, float]
620 a minimum positive support of a contrast set (p/P). When multiple values
621 are specified, a metainduction is performed; Default and recommended
622 sequence is: 0.8, 0.5, 0.2, 0.1
623 max_neg2pos: float
624 a maximum ratio of negative to positive supports (nP/pN); Default is 0.5
625 max_passes_count: int
626 a maximum number of sequential covering passes for a single minsupp-all;
627 Default is 5
628 penalty_strength: float
629 (s) - penalty strength; Default is 0.5
630 penalty_saturation: float
631 the value of p_new / P at which penalty reward saturates; Default is 0.2.
632 minsupp_new : float = 5.0
633 a minimum number (or fraction, if value < 1.0) of previously uncovered
634 examples to be covered by a new rule (positive examples for classification
635 problems); default: 5,
636 induction_measure : :class:`rulekit.params.Measures` = \
637 :class:`rulekit.params.Measures.Correlation`
638 measure used during induction; default measure is correlation
639 pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
640 :class:`rulekit.params.Measures.Correlation`
641 measure used during pruning. Could be user defined (string), for example
642 :code:`2 * p / n`; default measure is correlation
643 voting_measure : :class:`rulekit.params.Measures` = \
644 :class:`rulekit.params.Measures.Correlation`
645 measure used during voting; default measure is correlation
646 max_growing : int = 0.0
647 non-negative integer representing maximum number of conditions which can be
648 added to the rule in the growing phase (use this parameter for large
649 datasets if execution time is prohibitive); 0 indicates no limit; default: 0
650 enable_pruning : bool = True
651 enable or disable pruning, default is True.
652 ignore_missing : bool = False
653 boolean telling whether missing values should be ignored (by default, a
654 missing value of given attribute is always considered as not fulfilling the
655 condition build upon that attribute); default: False.
656 max_uncovered_fraction : float = 0.0
657 Floating-point number from [0,1] interval representing maximum fraction of
658 examples that may remain uncovered by the rule set, default: 0.0.
659 select_best_candidate : bool = False
660 Flag determining if best candidate should be selected from growing phase;
661 default: False.
662 complementary_conditions : bool = False
663 If enabled, complementary conditions in the form a = !{value} for nominal
664 attributes are supported.
665 control_apriori_precision : bool = True
666 When inducing classification rules, verify if candidate precision is higher
667 than apriori precision of the investigated class.
668 max_rule_count : int = 0
669 Maximum number of rules to be generated (for classification data sets it
670 applies to a single class); 0 indicates no limit.
671 approximate_induction: bool = False
672 Use an approximate induction heuristic which does not check all possible
673 splits; note: this is an experimental feature and currently works only for
674 classification data sets, results may change in future;
675 approximate_bins_count: int = 100
676 maximum number of bins for an attribute evaluated in the approximate
677 induction.
678 """
        BaseOperator.__init__(
            self,
            minsupp_all=minsupp_all,
            max_neg2pos=max_neg2pos,
            max_passes_count=max_passes_count,
            penalty_strength=penalty_strength,
            penalty_saturation=penalty_saturation,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        BaseClassifier.__init__(self)
        self.contrast_attribute: str = None
        self._remap_to_numeric = False
        self.label_unique_values = []
        self.model: RuleSet[ClassificationRule] = None

    def _map_result(self, predicted_example_set) -> np.ndarray:
        prediction: np.ndarray
        if self._remap_to_numeric:
            prediction = PredictionResultMapper.map_to_numerical(predicted_example_set)
        else:
            prediction = PredictionResultMapper.map_to_nominal(predicted_example_set)
        return prediction

    def _get_unique_label_values(self, labels: Data):
        # a dict is used to deduplicate label values while preserving their
        # order of appearance
        tmp = {}
        for label_value in labels:
            tmp[label_value] = None
        self.label_unique_values = list(tmp.keys())
        if len(self.label_unique_values) > 0 and isinstance(
            self.label_unique_values[0], bytes
        ):
            self.label_unique_values = [
                item.decode("utf-8") for item in self.label_unique_values
            ]

    def fit(
        self, values: Data, labels: Data, contrast_attribute: str
    ) -> ContrastSetRuleClassifier:  # pylint: disable=arguments-differ
        """Train model on the given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels
        contrast_attribute : str
            group attribute

        Returns
        -------
        self : ContrastSetRuleClassifier
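
        Examples
        --------
        A sketch, assuming a started JVM and a DataFrame ``df`` whose ``group``
        column (a made-up name) identifies the contrast groups::

            >>> X, y = df.drop("class", axis=1), df["class"]  # doctest: +SKIP
            >>> clf = ContrastSetRuleClassifier()
            >>> clf.fit(X, y, contrast_attribute="group")  # doctest: +SKIP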
743 """
744 RuleClassifier._get_unique_label_values( # pylint: disable=protected-access
745 self, labels
746 )
747 RuleClassifier._prepare_labels( # pylint: disable=protected-access,protected-access
748 self, labels
749 )
750 BaseOperator.fit(self, values, labels, contrast_attribute=contrast_attribute)
751 self.contrast_attribute = contrast_attribute
752 return self

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return predicted labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        return_metrics : bool = False
            Optional flag. If set to *True*, the method will calculate some
            additional model metrics and return a tuple instead of just the
            predicted labels.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If the *return_metrics* flag is not set, only the prediction is
            returned; otherwise a tuple is returned whose first element is the
            prediction and whose second element is the metrics.
        """
        return RuleClassifier.predict(self, values, return_metrics)

    def predict_proba(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return class probabilities for each example.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        return_metrics : bool = False
            Optional flag. If set to *True*, the method will calculate some
            additional model metrics and return a tuple instead of just the
            probabilities.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If the *return_metrics* flag is not set, only the probability matrix
            is returned; otherwise a tuple is returned whose first element is the
            probability matrix and whose second element is the metrics.
        """
        return RuleClassifier.predict_proba(self, values, return_metrics)

    def score(self, values: Data, labels: Data) -> float:
        """Return the accuracy on the given test data and labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true labels

        Returns
        -------
        score : float
            Accuracy of self.predict(values) w.r.t. labels.
        """
        return RuleClassifier.score(self, values, labels)

    def __getstate__(self) -> dict:
        return {
            **BaseOperator.__getstate__(self),
            **{
                "label_unique_values": self.label_unique_values,
                "_remap_to_numeric": self._remap_to_numeric,
                "contrast_attribute": self.contrast_attribute,
            },
        }

    def __setstate__(self, state: dict):
        BaseOperator.__setstate__(self, state)
        self._init_classification_rule_performance_classes()
        self.label_unique_values = state["label_unique_values"]
        self._remap_to_numeric = state["_remap_to_numeric"]
        self.contrast_attribute = state["contrast_attribute"]

    def _get_problem_type(self) -> ProblemType:
        return ProblemType.CONTRAST_CLASSIFICATION