Coverage for rulekit/classification.py: 81%

185 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-07 11:26 +0000

"""Module containing classes for classification analysis and prediction.
"""

3from __future__ import annotations 

4 

5from enum import Enum 

6from numbers import Number 

7from typing import Tuple 

8from typing import TypedDict 

9from typing import Union 

10 

11import numpy as np 

12import pandas as pd 

13from jpype import JClass 

14from jpype import JObject 

15from sklearn import metrics 

16 

17from rulekit._helpers import PredictionResultMapper 

18from rulekit._operator import BaseOperator 

19from rulekit._operator import Data 

20from rulekit._operator import ExpertKnowledgeOperator 

21from rulekit._problem_types import ProblemType 

22from rulekit.params import ContrastSetModelParams 

23from rulekit.params import DEFAULT_PARAMS_VALUE 

24from rulekit.params import ExpertModelParams 

25from rulekit.params import Measures 

26from rulekit.params import ModelsParams 

27from rulekit.rules import ClassificationRule 

28from rulekit.rules import RuleSet 

29 

30 

class ClassificationPredictionMetrics(TypedDict):
    """Stores additional metrics for classification prediction.

    Fields:
        * rules_per_example (float): Average number of rules per example.
        * voting_conflicts (float): Number of voting conflicts.
    """

    rules_per_example: float
    voting_conflicts: float

41 

42 

class _ClassificationParams(ModelsParams):
    """Internal parameter model for classification operators.

    Extends the shared ``ModelsParams`` fields with classification-specific
    options; defaults are taken from ``DEFAULT_PARAMS_VALUE``.
    """

    control_apriori_precision: bool = DEFAULT_PARAMS_VALUE["control_apriori_precision"]
    approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"]
    approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"]

47 

48 

class _ClassificationExpertParams(_ClassificationParams, ExpertModelParams):
    """Internal parameter model combining classification-specific options with
    the expert-knowledge options (see the two base classes)."""

    pass

51 

52 

class BaseClassifier:
    """Mixin supplying classification-specific prediction metrics computed by
    the RuleKit Java backend.

    :meta private:
    """

    class MetricTypes(Enum):
        """Identifiers of the Java-side classification performance counters.

        :meta private:
        """

        RulesPerExample = 1  # pylint: disable=invalid-name
        VotingConflicts = 2  # pylint: disable=invalid-name
        NegativeVotingConflicts = 3  # pylint: disable=invalid-name

    def __init__(self):
        # Java class handles start as None and are resolved through the JPype
        # bridge below; __setstate__ re-resolves them after unpickling.
        self._ClassificationRulesPerformance: JClass = None  # pylint: disable=invalid-name
        self._NegativeVotingConflictsPerformance: JClass = None  # pylint: disable=invalid-name
        self._init_classification_rule_performance_classes()

    def _init_classification_rule_performance_classes(self):
        # Resolve the RuleKit performance class from the running JVM.
        self._ClassificationRulesPerformance = JClass(  # pylint: disable=invalid-name
            "adaa.analytics.rules.logic.performance.ClassificationRulesPerformance"
        )

    def _calculate_metric(
        self, example_set: JObject, metric_type: MetricTypes
    ) -> float:
        """Evaluate a single Java-side counter over *example_set*."""
        counter: JObject = self._ClassificationRulesPerformance(metric_type.value)
        return float(counter.countExample(example_set).getValue())

    def _calculate_prediction_metrics(
        self, example_set
    ) -> ClassificationPredictionMetrics:
        """Collect the additional classification metrics for a predicted set."""
        calculate = self._calculate_metric
        rules_per_example = calculate(
            example_set, BaseClassifier.MetricTypes.RulesPerExample
        )
        voting_conflicts = calculate(
            example_set, BaseClassifier.MetricTypes.VotingConflicts
        )
        return ClassificationPredictionMetrics(
            rules_per_example=rules_per_example,
            voting_conflicts=voting_conflicts,
        )

95 

96 

class RuleClassifier(BaseOperator, BaseClassifier):
    """Classification model."""

    __params_class__ = _ClassificationParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
    ):
        """
        Parameters
        ----------
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered
            examples to be covered by a new rule (positive examples for classification
            problems); default: 5,
        induction_measure : :class:`rulekit.params.Measures` = :class:`rulekit.params.\
Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
:class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
:class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition built upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        control_apriori_precision : bool = True
            When inducing classification rules, verify if candidate precision is higher
            than apriori precision of the investigated class.
        max_rule_count : int = 0
            Maximum number of rules to be generated (for classification data sets it
            applies to a single class); 0 indicates no limit.
        approximate_induction: bool = False
            Use an approximate induction heuristic which does not check all possible
            splits; note: this is an experimental feature and currently works only for
            classification data sets, results may change in future;
        approximate_bins_count: int = 100
            maximum number of bins for an attribute evaluated in the approximate
            induction.
        """
        BaseOperator.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        BaseClassifier.__init__(self)
        # True when labels were numeric and converted to strings for the
        # backend; predictions are then mapped back to numbers.
        self._remap_to_numeric = False
        self.label_unique_values = []
        self.model: RuleSet[ClassificationRule] = None

    def _map_result(self, predicted_example_set) -> np.ndarray:
        """Map the backend prediction result to a numpy array of labels."""
        prediction: np.ndarray
        if self._remap_to_numeric:
            prediction = PredictionResultMapper.map_to_numerical(predicted_example_set)
        else:
            prediction = PredictionResultMapper.map_to_nominal(predicted_example_set)
        return prediction

    def _map_confidence(self, predicted_example_set) -> np.ndarray:
        """Map the backend result to per-class confidence values, with columns
        ordered by ``self.label_unique_values``."""
        return PredictionResultMapper.map_confidence(
            predicted_example_set, self.label_unique_values
        )

    def _get_unique_label_values(self, labels: Data):
        """Store unique label values (preserving first-seen order) in
        ``self.label_unique_values``, decoding ``bytes`` values to ``str``."""
        # dict keys preserve insertion order, acting as an ordered set.
        tmp = {}
        for label_value in labels:
            tmp[label_value] = None
        self.label_unique_values = list(tmp.keys())
        if len(self.label_unique_values) > 0 and isinstance(
            self.label_unique_values[0], bytes
        ):
            self.label_unique_values = [
                item.decode("utf-8") for item in self.label_unique_values
            ]

    def _prepare_labels(self, labels: Data) -> Data:
        """Return labels converted to strings when they are boolean/numeric.

        Sets ``self._remap_to_numeric`` for numeric labels so predictions can
        later be mapped back to numbers. Non-boolean, non-numeric labels are
        returned unchanged.
        """
        if isinstance(labels, (pd.DataFrame, pd.Series)):
            # NOTE(review): for a DataFrame, `.dtypes` is a Series of dtypes
            # whose `.name` is not a dtype name, so the bool check below only
            # triggers for a Series — confirm whether DataFrame labels are
            # expected here.
            if labels.dtypes.name == "bool":
                return labels.astype(str)
            if isinstance(labels.iloc[0], Number):
                self._remap_to_numeric = True
                return labels.astype(str)
        else:
            if isinstance(labels[0], bool) or (
                isinstance(labels, np.ndarray) and labels.dtype.name == "bool"
            ):
                return np.array(list(map(str, labels)))
            if isinstance(labels[0], Number):
                self._remap_to_numeric = True
                return np.array(list(map(str, labels)))
        return labels

    def fit(
        self, values: Data, labels: Data
    ) -> RuleClassifier:  # pylint: disable=arguments-differ
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels
        Returns
        -------
        self : RuleClassifier
        """
        # Unique values are collected before conversion so they keep the
        # caller's original label type.
        self._get_unique_label_values(labels)
        labels = self._prepare_labels(labels)
        BaseOperator.fit(self, values, labels)
        return self

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and returns predicted labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes

        return_metrics: bool = False
            Optional flag. If set to *True* method will calculate some additional model
            metrics. Method will then return tuple instead of just predicted labels.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If *return_metrics* flag wasn't set it will return just prediction,
            otherwise a tuple will be returned with first element being prediction and
            second one being metrics.
        """
        result_example_set = BaseOperator.predict(self, values)
        y_pred = self._map_result(result_example_set)
        if return_metrics:
            metrics_values: ClassificationPredictionMetrics = (
                BaseClassifier._calculate_prediction_metrics(self, result_example_set)
            )
            return (y_pred, metrics_values)
        return y_pred

    def predict_proba(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and returns class probabilities for each example.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes

        return_metrics: bool = False
            Optional flag. If set to *True* method will calculate some additional model
            metrics. Method will then return tuple instead of just probabilities.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If *return_metrics* flag wasn't set it will return just probabilities
            matrix, otherwise a tuple will be returned with first element being
            prediction and second one being metrics.
        """
        result_example_set = BaseOperator.predict(self, values)
        mapped_result_example_set = self._map_confidence(result_example_set)
        if return_metrics:
            metrics_values: ClassificationPredictionMetrics = (
                BaseClassifier._calculate_prediction_metrics(self, result_example_set)
            )
            return (mapped_result_example_set, metrics_values)
        return mapped_result_example_set

    def score(self, values: Data, labels: Data) -> float:
        """Return the accuracy on the given test data and labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true labels

        Returns
        -------
        score : float
            Accuracy of self.predict(values) wrt. labels.
        """
        predicted_labels = self.predict(values)
        return metrics.accuracy_score(labels, predicted_labels)

    def __getstate__(self) -> dict:
        """Serialize state; extends the base state with the label values and
        remap flag this class needs after unpickling."""
        return {
            **BaseOperator.__getstate__(self),
            **{
                "label_unique_values": self.label_unique_values,
                "_remap_to_numeric": self._remap_to_numeric,
            },
        }

    def __setstate__(self, state: dict):
        """Restore state; the JPype class handles are not picklable and must
        be re-resolved from the JVM."""
        BaseOperator.__setstate__(self, state)
        self._init_classification_rule_performance_classes()
        self.label_unique_values = state["label_unique_values"]
        self._remap_to_numeric = state["_remap_to_numeric"]

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a classification problem."""
        return ProblemType.CLASSIFICATION

354 

355 

class ExpertRuleClassifier(ExpertKnowledgeOperator, RuleClassifier):
    """Classification model using expert knowledge."""

    __params_class__ = _ClassificationExpertParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
        extend_using_preferred: bool = DEFAULT_PARAMS_VALUE["extend_using_preferred"],
        extend_using_automatic: bool = DEFAULT_PARAMS_VALUE["extend_using_automatic"],
        induce_using_preferred: bool = DEFAULT_PARAMS_VALUE["induce_using_preferred"],
        induce_using_automatic: bool = DEFAULT_PARAMS_VALUE["induce_using_automatic"],
        consider_other_classes: bool = DEFAULT_PARAMS_VALUE["consider_other_classes"],
        preferred_conditions_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_conditions_per_rule"
        ],
        preferred_attributes_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_attributes_per_rule"
        ],
    ):
        """
        Parameters
        ----------
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered examples
            to be covered by a new rule (positive examples for classification problems);
            default: 5,

        induction_measure : :class:`rulekit.params.Measures` = \
:class:`rulekit.params.Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
:class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
:class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition built upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        control_apriori_precision : bool = True
            When inducing classification rules, verify if candidate precision is higher
            than apriori precision of the investigated class.
        max_rule_count : int = 0
            Maximum number of rules to be generated (for classification data sets it
            applies to a single class); 0 indicates no limit.
        approximate_induction: bool = False
            Use an approximate induction heuristic which does not check all possible
            splits; note: this is an experimental feature and currently works only for
            classification data sets, results may change in future;
        approximate_bins_count: int = 100
            maximum number of bins for an attribute evaluated in the approximate
            induction.

        extend_using_preferred : bool = False
            boolean indicating whether initial rules should be extended with a use of
            preferred conditions and attributes; default is False
        extend_using_automatic : bool = False
            boolean indicating whether initial rules should be extended with a use of
            automatic conditions and attributes; default is False
        induce_using_preferred : bool = False
            boolean indicating whether new rules should be induced with a use of
            preferred conditions and attributes; default is False
        induce_using_automatic : bool = False
            boolean indicating whether new rules should be induced with a use of
            automatic conditions and attributes; default is False
        consider_other_classes : bool = False
            boolean indicating whether automatic induction should be performed for
            classes for which no user's knowledge has been defined
            (classification only); default is False.
        preferred_conditions_per_rule : int = None
            maximum number of preferred conditions per rule; default: unlimited,
        preferred_attributes_per_rule : int = None
            maximum number of preferred attributes per rule; default: unlimited.
        """
        self._remap_to_numeric = False
        RuleClassifier.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        ExpertKnowledgeOperator.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            extend_using_preferred=extend_using_preferred,
            extend_using_automatic=extend_using_automatic,
            induce_using_preferred=induce_using_preferred,
            induce_using_automatic=induce_using_automatic,
            consider_other_classes=consider_other_classes,
            preferred_conditions_per_rule=preferred_conditions_per_rule,
            preferred_attributes_per_rule=preferred_attributes_per_rule,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        self.model: RuleSet[ClassificationRule] = None

    def fit(  # pylint: disable=arguments-differ,too-many-arguments
        self,
        values: Data,
        labels: Data,
        expert_rules: list[Union[str, tuple[str, str]]] = None,
        expert_preferred_conditions: list[Union[str, tuple[str, str]]] = None,
        expert_forbidden_conditions: list[Union[str, tuple[str, str]]] = None,
    ) -> ExpertRuleClassifier:
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels

        expert_rules : List[Union[str, Tuple[str, str]]]
            set of initial rules, either passed as a list of strings representing rules
            or as list of tuples where first element is name of the rule and second one
            is rule string.
        expert_preferred_conditions : List[Union[str, Tuple[str, str]]]
            multiset of preferred conditions (used also for specifying preferred
            attributes by using special value Any). Either passed as a list of strings
            representing rules or as list of tuples where first element is name of the
            rule and second one is rule string.
        expert_forbidden_conditions : List[Union[str, Tuple[str, str]]]
            set of forbidden conditions (used also for specifying forbidden attributes
            by using special value Any). Either passed as a list of strings representing
            rules or as list of tuples where first element is name of the rule and
            second one is rule string.
        Returns
        -------
        self : ExpertRuleClassifier
        """
        # Numeric labels are converted to strings up-front so that
        # _get_unique_label_values sees the converted values (this class,
        # unlike RuleClassifier, collects unique values after conversion).
        if isinstance(labels, (pd.DataFrame, pd.Series)):
            if isinstance(labels.iloc[0], Number):
                self._remap_to_numeric = True
                labels = labels.astype(str)
        else:
            if isinstance(labels[0], Number):
                self._remap_to_numeric = True
                labels = list(map(str, labels))
        self._get_unique_label_values(labels)
        # Bug fix: the converted labels returned by _prepare_labels were
        # previously discarded, so boolean labels (not handled above) reached
        # the backend unconverted. Keep the result, as RuleClassifier.fit does.
        labels = self._prepare_labels(labels)
        return ExpertKnowledgeOperator.fit(
            self,
            values,
            labels,
            expert_rules=expert_rules,
            expert_preferred_conditions=expert_preferred_conditions,
            expert_forbidden_conditions=expert_forbidden_conditions,
        )

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and return predicted labels; see
        :meth:`RuleClassifier.predict` for details."""
        return RuleClassifier.predict(self, values, return_metrics)

    def __getstate__(self) -> dict:
        """Serialize state; extends the base state with the label values and
        remap flag needed after unpickling.

        Bug fix: ``label_unique_values`` was previously not serialized (unlike
        in RuleClassifier), which left unpickled models unable to map
        confidences in predict_proba.
        """
        return {
            **BaseOperator.__getstate__(self),
            **{
                "label_unique_values": self.label_unique_values,
                "_remap_to_numeric": self._remap_to_numeric,
            },
        }

    def __setstate__(self, state: dict):
        """Restore state and re-resolve the JPype performance classes, which
        are not picklable (mirrors RuleClassifier.__setstate__)."""
        BaseOperator.__setstate__(self, state)
        self._init_classification_rule_performance_classes()
        # .get keeps compatibility with pickles created before
        # label_unique_values was serialized by __getstate__.
        self.label_unique_values = state.get("label_unique_values", [])
        self._remap_to_numeric = state["_remap_to_numeric"]

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a classification problem."""
        return ProblemType.CLASSIFICATION

581 

582 

class ContrastSetRuleClassifier(BaseOperator, BaseClassifier):
    """Contrast set classification model."""

    __params_class__ = ContrastSetModelParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_all: Tuple[float, float, float, float] = DEFAULT_PARAMS_VALUE[
            "minsupp_all"
        ],
        max_neg2pos: float = DEFAULT_PARAMS_VALUE["max_neg2pos"],
        max_passes_count: int = DEFAULT_PARAMS_VALUE["max_passes_count"],
        penalty_strength: float = DEFAULT_PARAMS_VALUE["penalty_strength"],
        penalty_saturation: float = DEFAULT_PARAMS_VALUE["penalty_saturation"],
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        control_apriori_precision: bool = DEFAULT_PARAMS_VALUE[
            "control_apriori_precision"
        ],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        approximate_induction: bool = DEFAULT_PARAMS_VALUE["approximate_induction"],
        approximate_bins_count: int = DEFAULT_PARAMS_VALUE["approximate_bins_count"],
    ):
        """
        Parameters
        ----------
        minsupp_all: Tuple[float, float, float, float]
            a minimum positive support of a contrast set (p/P). When multiple values
            are specified, a metainduction is performed; Default and recommended
            sequence is: 0.8, 0.5, 0.2, 0.1
        max_neg2pos: float
            a maximum ratio of negative to positive supports (nP/pN); Default is 0.5
        max_passes_count: int
            a maximum number of sequential covering passes for a single minsupp-all;
            Default is 5
        penalty_strength: float
            (s) - penalty strength; Default is 0.5
        penalty_saturation: float
            the value of p_new / P at which penalty reward saturates; Default is 0.2.
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered
            examples to be covered by a new rule (positive examples for classification
            problems); default: 5,
        induction_measure : :class:`rulekit.params.Measures` = \
:class:`rulekit.params.Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
:class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
:class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition built upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        control_apriori_precision : bool = True
            When inducing classification rules, verify if candidate precision is higher
            than apriori precision of the investigated class.
        max_rule_count : int = 0
            Maximum number of rules to be generated (for classification data sets it
            applies to a single class); 0 indicates no limit.
        approximate_induction: bool = False
            Use an approximate induction heuristic which does not check all possible
            splits; note: this is an experimental feature and currently works only for
            classification data sets, results may change in future;
        approximate_bins_count: int = 100
            maximum number of bins for an attribute evaluated in the approximate
            induction.
        """
        BaseOperator.__init__(
            self,
            minsupp_all=minsupp_all,
            max_neg2pos=max_neg2pos,
            max_passes_count=max_passes_count,
            penalty_strength=penalty_strength,
            penalty_saturation=penalty_saturation,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            control_apriori_precision=control_apriori_precision,
            max_rule_count=max_rule_count,
            approximate_induction=approximate_induction,
            approximate_bins_count=approximate_bins_count,
        )
        BaseClassifier.__init__(self)
        self.contrast_attribute: str = None
        # True when labels were numeric and converted to strings for the
        # backend; predictions are then mapped back to numbers.
        self._remap_to_numeric = False
        self.label_unique_values = []
        self.model: RuleSet[ClassificationRule] = None

    def _map_result(self, predicted_example_set) -> np.ndarray:
        """Map the backend prediction result to a numpy array of labels."""
        prediction: np.ndarray
        if self._remap_to_numeric:
            prediction = PredictionResultMapper.map_to_numerical(predicted_example_set)
        else:
            prediction = PredictionResultMapper.map_to_nominal(predicted_example_set)
        return prediction

    def _get_unique_label_values(self, labels: Data):
        """Store unique label values (preserving first-seen order) in
        ``self.label_unique_values``, decoding ``bytes`` values to ``str``."""
        # dict keys preserve insertion order, acting as an ordered set.
        tmp = {}
        for label_value in labels:
            tmp[label_value] = None
        self.label_unique_values = list(tmp.keys())
        if len(self.label_unique_values) > 0 and isinstance(
            self.label_unique_values[0], bytes
        ):
            self.label_unique_values = [
                item.decode("utf-8") for item in self.label_unique_values
            ]

    def fit(
        self, values: Data, labels: Data, contrast_attribute: str
    ) -> ContrastSetRuleClassifier:  # pylint: disable=arguments-differ
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            labels
        contrast_attribute: str
            group attribute
        Returns
        -------
        self : ContrastSetRuleClassifier
        """
        RuleClassifier._get_unique_label_values(  # pylint: disable=protected-access
            self, labels
        )
        # Bug fix: the converted labels returned by _prepare_labels were
        # previously discarded, so numeric/boolean labels reached the backend
        # unconverted even though _remap_to_numeric was set. Keep the result,
        # as RuleClassifier.fit does.
        labels = RuleClassifier._prepare_labels(  # pylint: disable=protected-access
            self, labels
        )
        BaseOperator.fit(self, values, labels, contrast_attribute=contrast_attribute)
        self.contrast_attribute = contrast_attribute
        return self

    def predict(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and returns predicted labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes

        return_metrics: bool = False
            Optional flag. If set to *True* method will calculate some additional model
            metrics. Method will then return tuple instead of just predicted labels.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If *return_metrics* flag wasn't set it will return just prediction,
            otherwise a tuple will be returned with first element being prediction and
            second one being metrics.
        """
        return RuleClassifier.predict(self, values, return_metrics)

    def predict_proba(
        self, values: Data, return_metrics: bool = False
    ) -> Union[np.ndarray, tuple[np.ndarray, ClassificationPredictionMetrics]]:
        """Perform prediction and returns class probabilities for each example.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes

        return_metrics: bool = False
            Optional flag. If set to *True* method will calculate some additional model
            metrics. Method will then return tuple instead of just probabilities.

        Returns
        -------
        result : Union[np.ndarray, tuple[np.ndarray, :class:`rulekit.classification.\
ClassificationPredictionMetrics`]]
            If *return_metrics* flag wasn't set it will return just probabilities
            matrix, otherwise a tuple will be returned with first element being
            prediction and second one being metrics.
        """
        return RuleClassifier.predict_proba(self, values, return_metrics)

    def score(self, values: Data, labels: Data) -> float:
        """Return the accuracy on the given test data and labels.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true labels

        Returns
        -------
        score : float
            Accuracy of self.predict(values) wrt. labels.
        """
        return RuleClassifier.score(self, values, labels)

    def __getstate__(self) -> dict:
        """Serialize state; extends the base state with the fields this class
        needs after unpickling."""
        return {
            **BaseOperator.__getstate__(self),
            **{
                "label_unique_values": self.label_unique_values,
                "_remap_to_numeric": self._remap_to_numeric,
                "contrast_attribute": self.contrast_attribute,
            },
        }

    def __setstate__(self, state: dict):
        """Restore state and re-resolve the JPype performance classes, which
        are not picklable."""
        BaseOperator.__setstate__(self, state)
        self._init_classification_rule_performance_classes()
        self.label_unique_values = state["label_unique_values"]
        self._remap_to_numeric = state["_remap_to_numeric"]
        self.contrast_attribute = state["contrast_attribute"]

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a contrast-set classification problem."""
        return ProblemType.CONTRAST_CLASSIFICATION