Coverage for rulekit/regression.py: 80%

82 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-07 11:26 +0000

1"""Module containing classes for regression analysis and prediction. 

2""" 

3from __future__ import annotations 

4 

5from numbers import Number 

6from typing import Tuple 

7from typing import Union 

8 

9import numpy as np 

10import pandas as pd 

11from sklearn import metrics 

12 

13from rulekit._helpers import PredictionResultMapper 

14from rulekit._operator import BaseOperator 

15from rulekit._operator import Data 

16from rulekit._operator import ExpertKnowledgeOperator 

17from rulekit._problem_types import ProblemType 

18from rulekit.params import ContrastSetModelParams 

19from rulekit.params import DEFAULT_PARAMS_VALUE 

20from rulekit.params import ExpertModelParams 

21from rulekit.params import Measures 

22from rulekit.params import ModelsParams 

23from rulekit.rules import RegressionRule 

24from rulekit.rules import RuleSet 

25 

26 

class _RegressionModelParams(ModelsParams):
    """Internal parameter model for regression rule induction.

    Extends the shared :class:`rulekit.params.ModelsParams` with the
    regression-specific switch below.
    """

    # Enable fast induction of mean-based regression rules instead of the
    # default median-based ones (see the RuleRegressor docstring).
    mean_based_regression: bool = DEFAULT_PARAMS_VALUE["mean_based_regression"]

29 

30 

class _RegressionExpertModelParams(_RegressionModelParams, ExpertModelParams):
    """Internal parameter model combining regression parameters with the
    expert-knowledge parameters used by :class:`ExpertRuleRegressor`."""

    pass

33 

34 

class RuleRegressor(BaseOperator):
    """Regression model."""

    __params_class__ = _RegressionModelParams

    def __init__(  # pylint: disable=too-many-arguments
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        mean_based_regression: bool = DEFAULT_PARAMS_VALUE["mean_based_regression"],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
    ):
        """
        Parameters
        ----------
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered
            examples to be covered by a new rule (positive examples for classification
            problems); default: 5,
        induction_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition build upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        mean_based_regression : bool = True
            Enable fast induction of mean-based regression rules instead of default
            median-based.
        max_rule_count : int = 0
            Maximum number of rules to be generated; 0 indicates no limit. Default: 0
        """
        super().__init__(
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            mean_based_regression=mean_based_regression,
            max_rule_count=max_rule_count,
        )
        # Populated by fit(); holds the induced set of regression rules.
        self.model: RuleSet[RegressionRule] = None

    def _validate_labels(self, labels: Data):
        """Raise ``ValueError`` unless label values are numeric.

        Only the first label is inspected; the label column is assumed to be
        homogeneous in type.
        """
        if isinstance(labels, (pd.DataFrame, pd.Series)):
            first_label = labels.iloc[0]
        else:
            first_label = labels[0]
        if not isinstance(first_label, Number):
            # Fixed typo in the error message ("lables" -> "labels").
            raise ValueError(
                f"{self.__class__.__name__} requires labels values to be numeric"
            )

    def fit(  # pylint: disable=arguments-differ
        self, values: Data, labels: Data
    ) -> RuleRegressor:
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            target values

        Raises
        ------
        ValueError
            If label values are not numeric.

        Returns
        -------
        self : RuleRegressor
        """
        self._validate_labels(labels)
        super().fit(values, labels)
        return self

    def predict(self, values: Data) -> np.ndarray:
        """Perform prediction and returns predicted values.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes

        Returns
        -------
        result : np.ndarray
            predicted values
        """
        return self._map_result(super().predict(values))

    def score(self, values: Data, labels: Data) -> float:
        """Return the coefficient of determination R2 of the prediction

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true target values

        Returns
        -------
        score : float
            R2 of self.predict(values) wrt. labels.
        """
        predicted_labels = self.predict(values)
        return metrics.r2_score(labels, predicted_labels)

    def _map_result(self, predicted_example_set) -> np.ndarray:
        """Convert the Java prediction result into a numeric numpy array.

        ``remap=False`` because regression predictions are already numeric and
        need no label remapping.
        """
        return PredictionResultMapper.map_to_numerical(
            predicted_example_set, remap=False
        )

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a plain regression problem."""
        return ProblemType.REGRESSION

184 

185 

class ExpertRuleRegressor(ExpertKnowledgeOperator, RuleRegressor):
    """Expert Regression model."""

    __params_class__ = _RegressionExpertModelParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        mean_based_regression: bool = DEFAULT_PARAMS_VALUE["mean_based_regression"],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
        extend_using_preferred: bool = DEFAULT_PARAMS_VALUE["extend_using_preferred"],
        extend_using_automatic: bool = DEFAULT_PARAMS_VALUE["extend_using_automatic"],
        induce_using_preferred: bool = DEFAULT_PARAMS_VALUE["induce_using_preferred"],
        induce_using_automatic: bool = DEFAULT_PARAMS_VALUE["induce_using_automatic"],
        preferred_conditions_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_conditions_per_rule"
        ],
        preferred_attributes_per_rule: int = DEFAULT_PARAMS_VALUE[
            "preferred_attributes_per_rule"
        ],
    ):
        """
        Parameters
        ----------
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered
            examples to be covered by a new rule (positive examples for classification
            problems); default: 5,
        induction_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0,
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition build upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        mean_based_regression : bool = True
            Enable fast induction of mean-based regression rules instead of default
            median-based.
        max_rule_count : int = 0
            Maximum number of rules to be generated (for classification data sets it
            applies to a single class); 0 indicates no limit.

        extend_using_preferred : bool = False
            boolean indicating whether initial rules should be extended with a use of
            preferred conditions and attributes; default is False
        extend_using_automatic : bool = False
            boolean indicating whether initial rules should be extended with a use of
            automatic conditions and attributes; default is False
        induce_using_preferred : bool = False
            boolean indicating whether new rules should be induced with a use of
            preferred conditions and attributes; default is False
        induce_using_automatic : bool = False
            boolean indicating whether new rules should be induced with a use of
            automatic conditions and attributes; default is False
        preferred_conditions_per_rule : int = None
            maximum number of preferred conditions per rule; default: unlimited,
        preferred_attributes_per_rule : int = None
            maximum number of preferred attributes per rule; default: unlimited.
        """
        # Both base initializers are called explicitly (not via super()) so
        # that each receives exactly the keyword set it expects; the shared
        # keywords are deliberately passed twice with identical values.
        RuleRegressor.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            mean_based_regression=mean_based_regression,
            max_rule_count=max_rule_count,
        )
        ExpertKnowledgeOperator.__init__(
            self,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            extend_using_preferred=extend_using_preferred,
            extend_using_automatic=extend_using_automatic,
            induce_using_preferred=induce_using_preferred,
            induce_using_automatic=induce_using_automatic,
            preferred_conditions_per_rule=preferred_conditions_per_rule,
            preferred_attributes_per_rule=preferred_attributes_per_rule,
            complementary_conditions=complementary_conditions,
            mean_based_regression=mean_based_regression,
            max_rule_count=max_rule_count,
        )
        # Populated by fit(); holds the induced set of regression rules.
        self.model: RuleSet[RegressionRule] = None

    def fit(  # pylint: disable=arguments-differ,too-many-arguments
        self,
        values: Data,
        labels: Data,
        expert_rules: list[Union[str, tuple[str, str]]] = None,
        expert_preferred_conditions: list[Union[str, tuple[str, str]]] = None,
        expert_forbidden_conditions: list[Union[str, tuple[str, str]]] = None,
    ) -> ExpertRuleRegressor:
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            target values

        expert_rules : List[Union[str, Tuple[str, str]]]
            set of initial rules, either passed as a list of strings representing rules
            or as list of tuples where first element is name of the rule and second one
            is rule string.
        expert_preferred_conditions : List[Union[str, Tuple[str, str]]]
            multiset of preferred conditions (used also for specifying preferred
            attributes by using special value Any). Either passed as a list of strings
            representing rules or as list of tuples where first element is name of the
            rule and second one is rule string.
        expert_forbidden_conditions : List[Union[str, Tuple[str, str]]]
            set of forbidden conditions (used also for specifying forbidden attributes
            by using special value Any). Either passed as a list of strings representing
            rules or as list of tuples where first element is name of the rule and
            second one is rule string.

        Raises
        ------
        ValueError
            If label values are not numeric.

        Returns
        -------
        self : ExpertRuleRegressor
        """
        self._validate_labels(labels)
        return ExpertKnowledgeOperator.fit(
            self,
            values,
            labels,
            expert_rules=expert_rules,
            expert_preferred_conditions=expert_preferred_conditions,
            expert_forbidden_conditions=expert_forbidden_conditions,
        )

    def predict(self, values: Data) -> np.ndarray:
        """Perform prediction and return predicted values as a numpy array."""
        return self._map_result(ExpertKnowledgeOperator.predict(self, values))

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a plain regression problem."""
        return ProblemType.REGRESSION

366 

367 

class ContrastSetRuleRegressor(BaseOperator):
    """Contrast set regression model."""

    __params_class__ = ContrastSetModelParams

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        minsupp_all: Tuple[float, float, float, float] = DEFAULT_PARAMS_VALUE[
            "minsupp_all"
        ],
        max_neg2pos: float = DEFAULT_PARAMS_VALUE["max_neg2pos"],
        max_passes_count: int = DEFAULT_PARAMS_VALUE["max_passes_count"],
        penalty_strength: float = DEFAULT_PARAMS_VALUE["penalty_strength"],
        penalty_saturation: float = DEFAULT_PARAMS_VALUE["penalty_saturation"],
        minsupp_new: float = DEFAULT_PARAMS_VALUE["minsupp_new"],
        induction_measure: Measures = DEFAULT_PARAMS_VALUE["induction_measure"],
        pruning_measure: Union[Measures, str] = DEFAULT_PARAMS_VALUE["pruning_measure"],
        voting_measure: Measures = DEFAULT_PARAMS_VALUE["voting_measure"],
        max_growing: float = DEFAULT_PARAMS_VALUE["max_growing"],
        enable_pruning: bool = DEFAULT_PARAMS_VALUE["enable_pruning"],
        ignore_missing: bool = DEFAULT_PARAMS_VALUE["ignore_missing"],
        max_uncovered_fraction: float = DEFAULT_PARAMS_VALUE["max_uncovered_fraction"],
        select_best_candidate: bool = DEFAULT_PARAMS_VALUE["select_best_candidate"],
        complementary_conditions: bool = DEFAULT_PARAMS_VALUE[
            "complementary_conditions"
        ],
        mean_based_regression: bool = DEFAULT_PARAMS_VALUE["mean_based_regression"],
        max_rule_count: int = DEFAULT_PARAMS_VALUE["max_rule_count"],
    ):
        """
        Parameters
        ----------
        minsupp_all: Tuple[float, float, float, float]
            a minimum positive support of a contrast set (p/P). When multiple values are
            specified, a metainduction is performed; Default and recommended sequence
            is: 0.8, 0.5, 0.2, 0.1
        max_neg2pos: float
            a maximum ratio of negative to positive supports (nP/pN); Default is 0.5
        max_passes_count: int
            a maximum number of sequential covering passes for a single minsupp-all;
            Default is 5
        penalty_strength: float
            (s) - penalty strength; Default is 0.5
        penalty_saturation: float
            the value of p_new / P at which penalty reward saturates; Default is 0.2.
        minsupp_new : float = 5.0
            a minimum number (or fraction, if value < 1.0) of previously uncovered
            examples to be covered by a new rule (positive examples for classification
            problems); default: 5,
        induction_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during induction; default measure is correlation
        pruning_measure : Union[:class:`rulekit.params.Measures`, str] = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during pruning. Could be user defined (string), for example
            :code:`2 * p / n`; default measure is correlation
        voting_measure : :class:`rulekit.params.Measures` = \
            :class:`rulekit.params.Measures.Correlation`
            measure used during voting; default measure is correlation
        max_growing : int = 0.0
            non-negative integer representing maximum number of conditions which can be
            added to the rule in the growing phase (use this parameter for large
            datasets if execution time is prohibitive); 0 indicates no limit; default: 0
        enable_pruning : bool = True
            enable or disable pruning, default is True.
        ignore_missing : bool = False
            boolean telling whether missing values should be ignored (by default, a
            missing value of given attribute is always considered as not fulfilling the
            condition build upon that attribute); default: False.
        max_uncovered_fraction : float = 0.0
            Floating-point number from [0,1] interval representing maximum fraction of
            examples that may remain uncovered by the rule set, default: 0.0.
        select_best_candidate : bool = False
            Flag determining if best candidate should be selected from growing phase;
            default: False.
        complementary_conditions : bool = False
            If enabled, complementary conditions in the form a = !{value} for nominal
            attributes are supported.
        mean_based_regression : bool = True
            Enable fast induction of mean-based regression rules instead of default
            median-based.
        max_rule_count : int = 0
            Maximum number of rules to be generated; 0 indicates no limit.
        """
        super().__init__(
            minsupp_all=minsupp_all,
            max_neg2pos=max_neg2pos,
            max_passes_count=max_passes_count,
            penalty_strength=penalty_strength,
            penalty_saturation=penalty_saturation,
            minsupp_new=minsupp_new,
            induction_measure=induction_measure,
            pruning_measure=pruning_measure,
            voting_measure=voting_measure,
            max_growing=max_growing,
            enable_pruning=enable_pruning,
            ignore_missing=ignore_missing,
            max_uncovered_fraction=max_uncovered_fraction,
            select_best_candidate=select_best_candidate,
            complementary_conditions=complementary_conditions,
            mean_based_regression=mean_based_regression,
            max_rule_count=max_rule_count,
        )
        # Name of the group attribute passed to fit(); needed for pickling.
        self.contrast_attribute: str = None
        # Populated by fit(); holds the induced set of regression rules.
        self.model: RuleSet[RegressionRule] = None

    def fit(
        self, values: Data, labels: Data, contrast_attribute: str
    ) -> ContrastSetRuleRegressor:  # pylint: disable=arguments-differ
        """Train model on given dataset.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            target values
        contrast_attribute: str
            group attribute

        Raises
        ------
        ValueError
            If label values are not numeric.

        Returns
        -------
        self : ContrastSetRuleRegressor
        """
        # Reuse the numeric-labels check from RuleRegressor; this class does
        # not inherit from it, hence the explicit unbound call.
        RuleRegressor._validate_labels(self, labels)  # pylint: disable=protected-access
        super().fit(values, labels, contrast_attribute=contrast_attribute)
        self.contrast_attribute = contrast_attribute
        return self

    def predict(self, values: Data) -> np.ndarray:
        """Perform prediction and returns predicted values.

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes

        Returns
        -------
        result : np.ndarray
            predicted values
        """
        return RuleRegressor.predict(self, values)

    def score(self, values: Data, labels: Data) -> float:
        """Return the coefficient of determination R2 of the prediction

        Parameters
        ----------
        values : :class:`rulekit.operator.Data`
            attributes
        labels : :class:`rulekit.operator.Data`
            true target values

        Returns
        -------
        score : float
            R2 of self.predict(values) wrt. labels.
        """
        return RuleRegressor.score(self, values, labels)

    def __getstate__(self) -> dict:
        """Extend the base pickled state with the contrast attribute name."""
        # Flattened the redundant nested ``**{...}`` unpacking of the original.
        return {
            **BaseOperator.__getstate__(self),
            "contrast_attribute": self.contrast_attribute,
        }

    def __setstate__(self, state: dict):
        """Restore base state, then the contrast attribute name."""
        BaseOperator.__setstate__(self, state)
        self.contrast_attribute = state["contrast_attribute"]

    def _get_problem_type(self) -> ProblemType:
        """Identify this operator as a contrast-set regression problem."""
        return ProblemType.CONTRAST_REGRESSION