Coverage for rulekit/regression.py: 80%

82 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-07 11:26 +0000

1"""Module containing classes for regression analysis and prediction. 

2""" 

3from __future__ import annotations 

4 

5from numbers import Number 

6from typing import Tuple 

7from typing import Union 

8 

9import numpy as np 

10import pandas as pd 

11from sklearn import metrics 

12 

13from rulekit._helpers import PredictionResultMapper 

14from rulekit._operator import BaseOperator 

15from rulekit._operator import Data 

16from rulekit._operator import ExpertKnowledgeOperator 

17from rulekit._problem_types import ProblemType 

18from rulekit.params import ContrastSetModelParams 

19from rulekit.params import DEFAULT_PARAMS_VALUE 

20from rulekit.params import ExpertModelParams 

21from rulekit.params import Measures 

22from rulekit.params import ModelsParams 

23from rulekit.rules import RegressionRule 

24from rulekit.rules import RuleSet 

25 

26 

class _RegressionModelParams(ModelsParams):
    """Internal parameter model for regression rule induction.

    Extends the shared :class:`rulekit.params.ModelsParams` with the
    regression-specific switch below.
    """

    # Enable fast induction of mean-based regression rules instead of the
    # default median-based ones (see the RuleRegressor docstring).
    mean_based_regression: bool = DEFAULT_PARAMS_VALUE["mean_based_regression"]

29 

30 

class _RegressionExpertModelParams(_RegressionModelParams, ExpertModelParams):
    """Internal parameter model combining regression parameters with the
    expert-knowledge parameters used by :class:`ExpertRuleRegressor`."""

    pass

33 

34 

class RuleRegressor(BaseOperator):
    """Regression model."""

    __params_class__ = _RegressionModelParams

    def __init__(  # pylint: disable=too-many-arguments
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        mean_based_regression: bool = DEFAULT_PARAMS_VALUE["mean_based_regression"],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
    ):
        """
        Parameters
        ----------
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered
            examples to be covered by a new rule (positive examples for classification
            problems); default: 5,
        induction_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition build upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        mean_based_regression : bool = True
            Enable fast induction of mean-based regression rules instead of default
            median-based.
        max_rule_count : int = 0
            Maximum number of rules to be generated; 0 indicates no limit. Default: 0
        """
        super().__init__(
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            mean_based_regression=mean_based_regression,
            max_rule_count=max_rule_count,
        )
        # Populated by fit(); holds the induced set of regression rules.
        self.model: RuleSet[RegressionRule] = None

    def _validate_labels(self, labels: Data):
        """Raise ``ValueError`` unless label values are numeric.

        Only the first label is inspected; the label column is assumed to be
        homogeneous in type.
        """
        if isinstance(labels, (pd.DataFrame, pd.Series)):
            first_label = labels.iloc[0]
        else:
            first_label = labels[0]
        if not isinstance(first_label, Number):
            # Fixed typo in the error message ("lables" -> "labels").
            raise ValueError(
                f"{self.__class__.__name__} requires labels values to be numeric"
            )

    def fit(  # pylint: disable=arguments-differ
        self, values: Data, labels: Data
    ) -> RuleRegressor:
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            target values

        Raises
        ------
        ValueError
            If label values are not numeric.

        Returns
        -------
        self : RuleRegressor
        """
        self._validate_labels(labels)
        super().fit(values, labels)
        return self

    def predict(self, values: Data) -> np.ndarray:
        """Perform prediction and returns predicted values.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes

        Returns
        -------
        result : np.ndarray
            predicted values
        """
        return self._map_result(super().predict(values))

    def score(self, values: Data, labels: Data) -> float:
        """Return the coefficient of determination R2 of the prediction

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true target values

        Returns
        -------
        score : float
            R2 of self.predict(values) wrt. labels.
        """
        predicted_labels = self.predict(values)
        return metrics.r2_score(labels, predicted_labels)

    def _map_result(self, predicted_example_set) -> np.ndarray:
        """Convert the Java prediction result into a numeric numpy array.

        ``remap=False`` because regression predictions are already numeric and
        need no label remapping.
        """
        return PredictionResultMapper.map_to_numerical(
            predicted_example_set, remap=False
        )

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a plain regression problem."""
        return ProblemType.REGRESSION

184 

185 

class ExpertRuleRegressor(ExpertKnowledgeOperator, RuleRegressor):
    """Expert Regression model."""

    __params_class__ = _RegressionExpertModelParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        mean_based_regression: bool = DEFAULT_PARAMS_VALUE["mean_based_regression"],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        extend_using_preferred: bool = DEFAULT_PARAMS_VALUE["extend_using_preferred"],
        extend_using_automatic: bool = DEFAULT_PARAMS_VALUE["extend_using_automatic"],
        induce_using_preferred: bool = DEFAULT_PARAMS_VALUE["induce_using_preferred"],
        induce_using_automatic: bool = DEFAULT_PARAMS_VALUE["induce_using_automatic"],
        preferred_conditions_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_conditions_per_rule"
        ],
        preferred_attributes_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_attributes_per_rule"
        ],
    ):
        """
        Parameters
        ----------
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered
            examples to be covered by a new rule (positive examples for classification
            problems); default: 5,
        induction_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0,
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition build upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        mean_based_regression : bool = True
            Enable fast induction of mean-based regression rules instead of default
            median-based.
        max_rule_count : int = 0
            Maximum number of rules to be generated (for classification data sets it
            applies to a single class); 0 indicates no limit.

        extend_using_preferred : bool = False
            boolean indicating whether initial rules should be extended with a use of
            preferred conditions and attributes; default is False
        extend_using_automatic : bool = False
            boolean indicating whether initial rules should be extended with a use of
            automatic conditions and attributes; default is False
        induce_using_preferred : bool = False
            boolean indicating whether new rules should be induced with a use of
            preferred conditions and attributes; default is False
        induce_using_automatic : bool = False
            boolean indicating whether new rules should be induced with a use of
            automatic conditions and attributes; default is False
        preferred_conditions_per_rule : int = None
            maximum number of preferred conditions per rule; default: unlimited,
        preferred_attributes_per_rule : int = None
            maximum number of preferred attributes per rule; default: unlimited.
        """
        # Both base initializers are called explicitly (not via super()) so
        # that each receives exactly the keyword set it expects; the shared
        # keywords are deliberately passed twice with identical values.
        RuleRegressor.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            mean_based_regression=mean_based_regression,
            max_rule_count=max_rule_count,
        )
        ExpertKnowledgeOperator.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            extend_using_preferred=extend_using_preferred,
            extend_using_automatic=extend_using_automatic,
            induce_using_preferred=induce_using_preferred,
            induce_using_automatic=induce_using_automatic,
            preferred_conditions_per_rule=preferred_conditions_per_rule,
            preferred_attributes_per_rule=preferred_attributes_per_rule,
            complementary_conditions=complementary_conditions,
            mean_based_regression=mean_based_regression,
            max_rule_count=max_rule_count,
        )
        # Populated by fit(); holds the induced set of regression rules.
        self.model: RuleSet[RegressionRule] = None

    def fit(  # pylint: disable=arguments-differ,too-many-arguments
        self,
        values: Data,
        labels: Data,
        expert_rules: list[Union[str, tuple[str, str]]] = None,
        expert_preferred_conditions: list[Union[str, tuple[str, str]]] = None,
        expert_forbidden_conditions: list[Union[str, tuple[str, str]]] = None,
    ) -> ExpertRuleRegressor:
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            target values

        expert_rules : List[Union[str, Tuple[str, str]]]
            set of initial rules, either passed as a list of strings representing rules
            or as list of tuples where first element is name of the rule and second one
            is rule string.
        expert_preferred_conditions : List[Union[str, Tuple[str, str]]]
            multiset of preferred conditions (used also for specifying preferred
            attributes by using special value Any). Either passed as a list of strings
            representing rules or as list of tuples where first element is name of the
            rule and second one is rule string.
        expert_forbidden_conditions : List[Union[str, Tuple[str, str]]]
            set of forbidden conditions (used also for specifying forbidden attributes
            by using special value Any). Either passed as a list of strings representing
            rules or as list of tuples where first element is name of the rule and
            second one is rule string.

        Raises
        ------
        ValueError
            If label values are not numeric.

        Returns
        -------
        self : ExpertRuleRegressor
        """
        self._validate_labels(labels)
        return ExpertKnowledgeOperator.fit(
            self,
            values,
            labels,
            expert_rules=expert_rules,
            expert_preferred_conditions=expert_preferred_conditions,
            expert_forbidden_conditions=expert_forbidden_conditions,
        )

    def predict(self, values: Data) -> np.ndarray:
        """Perform prediction and return predicted values as a numpy array."""
        return self._map_result(ExpertKnowledgeOperator.predict(self, values))

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a plain regression problem."""
        return ProblemType.REGRESSION

366 

367 

class ContrastSetRuleRegressor(BaseOperator):
    """Contrast set regression model."""

    __params_class__ = ContrastSetModelParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_all: Tuple[float, float, float, float] = DEFAULT_PARAMS_VALUE[
            "minsupp_all"
        ],
        max_neg2pos: float = DEFAULT_PARAMS_VALUE["max_neg2pos"],
        max_passes_count: int = DEFAULT_PARAMS_VALUE["max_passes_count"],
        penalty_strength: float = DEFAULT_PARAMS_VALUE["penalty_strength"],
        penalty_saturation: float = DEFAULT_PARAMS_VALUE["penalty_saturation"],
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        mean_based_regression: bool = DEFAULT_PARAMS_VALUE["mean_based_regression"],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
    ):
        """
        Parameters
        ----------
        minsupp_all: Tuple[float, float, float, float]
            a minimum positive support of a contrast set (p/P). When multiple values are
            specified, a metainduction is performed; Default and recommended sequence
            is: 0.8, 0.5, 0.2, 0.1
        max_neg2pos: float
            a maximum ratio of negative to positive supports (nP/pN); Default is 0.5
        max_passes_count: int
            a maximum number of sequential covering passes for a single minsupp-all;
            Default is 5
        penalty_strength: float
            (s) - penalty strength; Default is 0.5
        penalty_saturation: float
            the value of p_new / P at which penalty reward saturates; Default is 0.2.
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered
            examples to be covered by a new rule (positive examples for classification
            problems); default: 5,
        induction_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition build upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        mean_based_regression : bool = True
            Enable fast induction of mean-based regression rules instead of default
            median-based.
        max_rule_count : int = 0
            Maximum number of rules to be generated; 0 indicates no limit.
        """
        super().__init__(
            minsupp_all=minsupp_all,
            max_neg2pos=max_neg2pos,
            max_passes_count=max_passes_count,
            penalty_strength=penalty_strength,
            penalty_saturation=penalty_saturation,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            mean_based_regression=mean_based_regression,
            max_rule_count=max_rule_count,
        )
        # Name of the group attribute passed to fit(); needed for pickling.
        self.contrast_attribute: str = None
        # Populated by fit(); holds the induced set of regression rules.
        self.model: RuleSet[RegressionRule] = None

    def fit(
        self, values: Data, labels: Data, contrast_attribute: str
    ) -> ContrastSetRuleRegressor:  # pylint: disable=arguments-differ
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            target values
        contrast_attribute: str
            group attribute

        Raises
        ------
        ValueError
            If label values are not numeric.

        Returns
        -------
        self : ContrastSetRuleRegressor
        """
        # Reuse the numeric-labels check from RuleRegressor; this class does
        # not inherit from it, hence the explicit unbound call.
        RuleRegressor._validate_labels(self, labels)  # pylint: disable=protected-access
        super().fit(values, labels, contrast_attribute=contrast_attribute)
        self.contrast_attribute = contrast_attribute
        return self

    def predict(self, values: Data) -> np.ndarray:
        """Perform prediction and returns predicted values.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes

        Returns
        -------
        result : np.ndarray
            predicted values
        """
        return RuleRegressor.predict(self, values)

    def score(self, values: Data, labels: Data) -> float:
        """Return the coefficient of determination R2 of the prediction

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true target values

        Returns
        -------
        score : float
            R2 of self.predict(values) wrt. labels.
        """
        return RuleRegressor.score(self, values, labels)

    def __getstate__(self) -> dict:
        """Extend the base pickled state with the contrast attribute name."""
        # Flattened the redundant nested ``**{...}`` unpacking of the original.
        return {
            **BaseOperator.__getstate__(self),
            "contrast_attribute": self.contrast_attribute,
        }

    def __setstate__(self, state: dict):
        """Restore base state, then the contrast attribute name."""
        BaseOperator.__setstate__(self, state)
        self.contrast_attribute = state["contrast_attribute"]

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a contrast-set regression problem."""
        return ProblemType.CONTRAST_REGRESSION