Expert Rules
This notebook presents example usage of user-guided rule induction, which follows the scheme introduced by the GuideR algorithm (Sikora et al., 2019).
Each problem (classification, regression, survival) has, in addition to the basic class, an expert class, e.g. RuleClassifier and ExpertRuleClassifier. Expert classes allow you to define a set of initial rules, preferred conditions and forbidden conditions.
This tutorial will show you how to define rules and conditions.
Classification
Load dataset
[2]:
import pandas as pd
from rulekit.arff import read_arff
# Raw GitHub location of the seismic-bumps ARFF file used by the
# classification example.
CLASSIFICATION_DATASET_URL: str = (
    'https://raw.githubusercontent.com/'
    'adaa-polsl/RuleKit/refs/heads/master/data/seismic-bumps/'
    'seismic-bumps.arff'
)

# Read the ARFF file into a DataFrame, then separate the label column
# ('class') from the remaining feature columns.
df: pd.DataFrame = read_arff(CLASSIFICATION_DATASET_URL)
y = df['class']
X = df.drop(columns=['class'])
Define expert knowledge
[3]:
# Expert knowledge for the classification example. Every entry is a
# (name, rule text) pair.
# NOTE(review): the double brackets [[...]] and the "N:" / "inf:" prefixes are
# part of RuleKit's expert-rule syntax (presumably: non-adjustable condition
# and usage multiplicity) -- confirm against the RuleKit documentation.

# Initial rules the induction starts from.
expert_rules: list[tuple[str, str]] = [
    (
        'rule-0',
        'IF [[gimpuls = <-inf, 750)]] THEN class = {0}',
    ),
    (
        'rule-1',
        'IF [[gimpuls = <750, inf)]] THEN class = {1}',
    ),
]

# Preferred knowledge: one concrete condition for class 0 and one whole
# attribute ("= Any") for class 1.
expert_preferred_conditions: list[tuple[str, str]] = [
    (
        'preferred-condition-0',
        '1: IF [[seismic = {a}]] THEN class = {0}',
    ),
    (
        'preferred-attribute-0',
        '1: IF [[gimpuls = Any]] THEN class = {1}',
    ),
]

# Forbidden knowledge: attributes ("= Any") declared per target class.
expert_forbidden_conditions: list[tuple[str, str]] = [
    (
        'forb-attribute-0',
        '1: IF [[seismoacoustic = Any]] THEN class = {0}',
    ),
    (
        'forb-attribute-1',
        'inf: IF [[ghazard = Any]] THEN class = {1}',
    ),
]
Rule induction
[4]:
from rulekit.classification import ExpertRuleClassifier
from rulekit.rules import RuleSet, ClassificationRule
# Induction settings for the expert classifier. All four extend_*/induce_*
# switches are enabled in this example.
clf_options = {
    'minsupp_new': 8,
    'max_growing': 0,
    'extend_using_preferred': True,
    'extend_using_automatic': True,
    'induce_using_preferred': True,
    'induce_using_automatic': True,
}
clf = ExpertRuleClassifier(**clf_options)

# The expert knowledge defined earlier is handed over at fit time, together
# with the training data.
clf_knowledge = {
    'expert_rules': expert_rules,
    'expert_preferred_conditions': expert_preferred_conditions,
    'expert_forbidden_conditions': expert_forbidden_conditions,
}
clf.fit(X, y, **clf_knowledge)

# The fitted model is exposed as a rule set.
ruleset: RuleSet[ClassificationRule] = clf.model
[5]:
# Print every rule of the fitted classification rule set, one per line.
for induced_rule in ruleset.rules:
    print(induced_rule)
IF [[gimpuls = <-inf, 750)]] AND [seismic = {a}] AND nbumps = (-inf, 1.50) AND nbumps4 = (-inf, 0.50) THEN class = {0}
IF nbumps = (-inf, 1.50) AND gimpuls = (-inf, 1252.50) THEN class = {0}
IF nbumps = (-inf, 1.50) AND gimpuls = (-inf, 1342.50) AND goimpuls = (-inf, 312) THEN class = {0}
IF nbumps = (-inf, 1.50) AND gimpuls = (-inf, 1427.50) THEN class = {0}
IF nbumps = (-inf, 1.50) AND gimpuls = (-inf, 1653.50) AND genergy = (-inf, 1006585) AND goimpuls = (-inf, 312) THEN class = {0}
IF nbumps = (-inf, 1.50) AND gimpuls = (-inf, 1752) THEN class = {0}
IF nbumps = (-inf, 1.50) AND gimpuls = (-inf, 2733) AND goimpuls = (-inf, 312) THEN class = {0}
IF nbumps = (-inf, 1.50) AND genergy = <634250, inf) AND gimpuls = <2965, inf) THEN class = {0}
IF nbumps = (-inf, 2.50) AND gimpuls = (-inf, 1331) THEN class = {0}
IF nbumps = (-inf, 2.50) AND gimpuls = (-inf, 1655.50) AND genergy = (-inf, 386010) THEN class = {0}
IF nbumps = (-inf, 2.50) AND gimpuls = (-inf, 1686) AND nbumps2 = (-inf, 1.50) AND goimpuls = (-inf, 312) AND nbumps5 = (-inf, 0.50) THEN class = {0}
IF nbumps = (-inf, 2.50) AND genergy = (-inf, 386010) AND gimpuls = (-inf, 2892) AND goimpuls = (-inf, 312) THEN class = {0}
IF nbumps = (-inf, 2.50) AND gimpuls = (-inf, 2068.50) AND goimpuls = (-inf, 312) AND nbumps2 = (-inf, 1.50) AND genergy = (-inf, 1004565) THEN class = {0}
IF nbumps = (-inf, 2.50) AND gimpuls = (-inf, 2184.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 1.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF nbumps = (-inf, 3.50) AND goimpuls = (-inf, 96.50) AND gimpuls = (-inf, 901) AND senergy = (-inf, 3850) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps2 = (-inf, 1.50) AND senergy = (-inf, 9600) AND nbumps3 = (-inf, 2.50) AND goimpuls = (-inf, 312) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps2 = (-inf, 1.50) AND nbumps3 = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND senergy = (-inf, 8100) AND nbumps2 = (-inf, 2.50) AND goimpuls = (-inf, 312) THEN class = {0}
IF maxenergy = (-inf, 5500) AND gimpuls = (-inf, 901) AND goenergy = <-40.50, 68.50) AND ghazard = {a} AND goimpuls = <-39.50, inf) AND senergy = <1150, inf) AND nbumps2 = <1.50, inf) THEN class = {0}
IF nbumps2 = (-inf, 1.50) AND nbumps3 = (-inf, 3.50) AND nbumps = (-inf, 6.50) AND gimpuls = (-inf, 695.50) AND goimpuls = <-54.50, inf) AND goenergy = <-48.50, inf) AND genergy = <10915, inf) AND maxenergy = <2500, inf) AND senergy = <3950, inf) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps = (-inf, 4.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 2.50) AND nbumps = (-inf, 5.50) THEN class = {0}
IF nbumps3 = (-inf, 3.50) AND nbumps4 = (-inf, 2.50) AND maxenergy = (-inf, 75000) AND genergy = (-inf, 378500) AND gimpuls = (-inf, 901) THEN class = {0}
IF senergy = (-inf, 85450) AND goimpuls = (-inf, 312) AND gimpuls = (-inf, 1139.50) THEN class = {0}
IF nbumps2 = (-inf, 0.50) AND nbumps3 = (-inf, 2.50) AND goimpuls = <-35.50, inf) AND nbumps = <1.50, inf) AND gimpuls = <1150.50, inf) THEN class = {0}
IF senergy = (-inf, 5750) AND genergy = (-inf, 508210) AND goenergy = <-18.50, inf) AND nbumps2 = <1.50, inf) AND gimpuls = <927, inf) THEN class = {0}
IF senergy = (-inf, 5750) THEN class = {0}
IF nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 2.50) AND gimpuls = (-inf, 2489.50) AND genergy = (-inf, 318735) THEN class = {0}
IF nbumps2 = (-inf, 1.50) AND goenergy = <-36.50, inf) AND goimpuls = (-inf, 6.50) AND genergy = <392530, inf) AND senergy = <6750, inf) THEN class = {0}
IF nbumps = (-inf, 4.50) AND nbumps2 = (-inf, 2.50) AND gimpuls = (-inf, 3881.50) THEN class = {0}
IF [[gimpuls = <750, inf)]] AND nbumps2 = <0.50, inf) AND genergy = <61250, 662435) AND maxenergy = <1500, inf) AND nbumps = (-inf, 7.50) AND nbumps3 = <0.50, inf) AND seismoacoustic = {a} AND goenergy = (-inf, 11) AND senergy = (-inf, 31200) THEN class = {1}
IF [gimpuls = <1253.50, inf)] AND genergy = <96260, 673155) AND seismic = {b} AND maxenergy = (-inf, 7500) AND goenergy = <-40.50, 87) AND seismoacoustic = {a} AND nbumps = (-inf, 3.50) AND senergy = (-inf, 10000) THEN class = {1}
IF nbumps2 = <0.50, inf) AND maxenergy = <1500, inf) AND gimpuls = <538.50, 1959) AND nbumps = (-inf, 6.50) AND senergy = (-inf, 36050) AND genergy = <61250, 662435) AND goenergy = (-inf, 96) AND nbumps3 = <0.50, 4.50) AND goimpuls = <-34, 95) THEN class = {1}
IF nbumps2 = <0.50, inf) AND genergy = <58310, 934630) AND goenergy = (-inf, 186) AND senergy = (-inf, 40650) AND maxenergy = <1500, inf) AND gimpuls = <538.50, inf) AND goimpuls = <-55, inf) THEN class = {1}
IF nbumps = <1.50, 4.50) AND nbumps2 = <0.50, 3.50) AND gimpuls = <521.50, inf) AND genergy = <58310, 799855) AND nbumps4 = (-inf, 1.50) AND senergy = <850, inf) AND goimpuls = <-39, 64.50) AND nbumps3 = (-inf, 2.50) THEN class = {1}
IF nbumps = <1.50, inf) AND nbumps2 = <0.50, 4.50) AND gimpuls = <521.50, inf) AND genergy = <34360, 1161025) AND goenergy = (-inf, 186) AND nbumps3 = (-inf, 6) AND maxenergy = <450, 45000) THEN class = {1}
IF nbumps = <1.50, inf) AND nbumps2 = <0.50, inf) AND genergy = <34880, inf) AND gimpuls = <281.50, inf) AND goenergy = (-inf, 135.50) THEN class = {1}
IF nbumps = <1.50, inf) AND nbumps2 = <0.50, inf) AND gimpuls = <153.50, 498) AND genergy = <18870, 33010) AND senergy = (-inf, 40500) AND goenergy = (-inf, 106.50) AND nbumps3 = (-inf, 1.50) THEN class = {1}
IF nbumps = <1.50, inf) AND goenergy = (-inf, 131) AND gimpuls = <176, inf) THEN class = {1}
IF gimpuls = <1253.50, inf) AND goenergy = (-inf, 131.50) AND genergy = <54930, 1062020) AND shift = {W} AND goimpuls = <-60.50, 109) AND senergy = (-inf, 36050) AND nbumps2 = (-inf, 2.50) THEN class = {1}
IF nbumps2 = <0.50, inf) AND gimpuls = <98.50, inf) AND goimpuls = <-70.50, inf) AND maxenergy = <550, inf) THEN class = {1}
IF goimpuls = <-74.50, inf) AND gimpuls = <32.50, inf) AND goenergy = <-78.50, inf) AND senergy = <850, inf) THEN class = {1}
IF genergy = <48545, inf) AND gimpuls = <131, inf) AND goenergy = (-inf, 176.50) THEN class = {1}
IF shift = {W} AND genergy = <32795, 49585) AND gimpuls = <396, 1445.50) AND goimpuls = <-19, inf) AND senergy = (-inf, 350) AND goenergy = <-4, inf) THEN class = {1}
IF genergy = <16805, 32020) AND gimpuls = <537.50, 796) AND goimpuls = <-36.50, inf) AND goenergy = <-37.50, inf) AND senergy = (-inf, 250) THEN class = {1}
IF shift = {W} AND genergy = <19670, 40735) AND gimpuls = <240, 470.50) AND goenergy = <-37.50, 181) AND goimpuls = <-42.50, inf) THEN class = {1}
IF gimpuls = <54.50, inf) AND senergy = (-inf, 115450) AND goimpuls = <-74.50, inf) AND genergy = <1510, inf) THEN class = {1}
Regression
Load dataset
[6]:
# Raw GitHub location of the methane training set (ARFF) used by the
# regression example. The constant name follows the same *_DATASET_URL
# pattern as the classification and survival examples.
REGRESSION_DATASET_URL: str = (
    'https://raw.githubusercontent.com/'
    'adaa-polsl/RuleKit/master/data/methane/'
    'methane-train.arff'
)

# Read the ARFF file into a DataFrame and split it into the features (X)
# and the regression target column 'MM116_pred' (y).
df: pd.DataFrame = read_arff(REGRESSION_DATASET_URL)
X, y = df.drop(['MM116_pred'], axis=1), df['MM116_pred']
[7]:
# Show the regression feature table; as the cell's last expression it is
# rendered as the cell output.
X
[7]:
|  | MM31 | MM116 | AS038 | PG072 | PD | BA13 | DMM116 |
|---|---|---|---|---|---|---|---|
| 0 | 0.46 | 1.3 | 2.4 | 2.0 | 1.0 | 1076.0 | 0.0 |
| 1 | 0.46 | 1.3 | 2.2 | 1.9 | 1.0 | 1076.0 | 0.0 |
| 2 | 0.49 | 1.3 | 2.2 | 1.9 | 1.0 | 1076.0 | 0.0 |
| 3 | 0.50 | 1.3 | 2.3 | 1.9 | 1.0 | 1076.0 | 0.0 |
| 4 | 0.54 | 1.3 | 2.3 | 1.9 | 1.0 | 1076.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 13363 | 0.64 | 1.2 | 2.4 | 1.8 | 1.0 | 1077.0 | 0.0 |
| 13364 | 0.59 | 1.2 | 2.4 | 1.8 | 1.0 | 1077.0 | 0.0 |
| 13365 | 0.60 | 1.1 | 2.2 | 1.8 | 1.0 | 1077.0 | -0.1 |
| 13366 | 0.64 | 1.1 | 2.2 | 1.8 | 1.0 | 1077.0 | 0.0 |
| 13367 | 0.65 | 1.2 | 2.2 | 1.7 | 0.0 | 1077.0 | 0.1 |
13368 rows × 7 columns
Define rules and conditions
[21]:
# Expert knowledge for the regression example, as (name, rule text) pairs.
# Regression rules carry no explicit conclusion, so each text ends at THEN.
# NOTE(review): the leading "3:", "5:" and "inf:" are presumably usage
# multiplicities in RuleKit's expert syntax -- confirm against the docs.
expert_preferred_conditions: list[tuple[str, str]] = [
    ('preferred-condition-0', '3: IF PD = <0.5, inf) THEN'),
    ('preferred-condition-1', '5: IF PD = <0.5, inf) AND MM116 = (-inf, 1.0) THEN'),
]

# The DMM116 attribute as a whole ("= Any") is declared forbidden.
expert_forbidden_conditions: list[tuple[str, str]] = [
    ('forb-attribute-0', 'inf: IF DMM116 = Any THEN'),
]
Rule induction
[22]:
from rulekit.regression import ExpertRuleRegressor
from rulekit.rules import RegressionRule
from rulekit.exceptions import RuleKitJavaException
# Induction settings for the expert regressor. Only extend_using_automatic
# is switched off in this example.
reg_options = {
    'minsupp_new': 5,
    'max_growing': 0,
    'mean_based_regression': True,
    'extend_using_preferred': True,
    'extend_using_automatic': False,
    'induce_using_preferred': True,
    'induce_using_automatic': True,
}
reg = ExpertRuleRegressor(**reg_options)

# No initial expert rules are passed here, only preferred and forbidden
# conditions.
reg_knowledge = {
    'expert_preferred_conditions': expert_preferred_conditions,
    'expert_forbidden_conditions': expert_forbidden_conditions,
}
reg.fit(X, y, **reg_knowledge)

# The fitted model is exposed as a rule set.
ruleset: RuleSet[RegressionRule] = reg.model
[23]:
# Print every rule of the fitted regression rule set, one per line.
for induced_rule in ruleset.rules:
    print(induced_rule)
IF [PD = <0.50, inf)] AND PG072 = (-inf, 2.05) THEN MM116_pred = {1.01} [0.77,1.25]
IF [PD = <0.50, inf)] THEN MM116_pred = {1.01} [0.77,1.25]
IF MM31 = (-inf, 0.23) THEN MM116_pred = {0.40} [0.39,0.41]
IF MM116 = (-inf, 0.45) AND MM31 = <0.18, 0.24) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM31 = (-inf, 0.25) THEN MM116_pred = {0.44} [0.37,0.51]
IF MM116 = <0.25, inf) AND AS038 = <2, 2.45) AND PD = (-inf, 0.50) AND PG072 = (-inf, 1.95) AND BA13 = (-inf, 1075.50) AND MM31 = <0.23, inf) THEN MM116_pred = {0.71} [0.50,0.93]
IF MM116 = (-inf, 0.25) AND BA13 = (-inf, 1075.50) AND MM31 = <0.19, inf) AND AS038 = <2.35, 2.45) AND PG072 = <1.75, 1.95) AND PD = (-inf, 0.50) THEN MM116_pred = {0.25} [0.20,0.30]
IF MM116 = (-inf, 0.45) AND BA13 = (-inf, 1077.50) AND MM31 = <0.18, inf) THEN MM116_pred = {0.40} [0.37,0.43]
IF MM116 = (-inf, 0.55) AND MM31 = (-inf, 0.32) THEN MM116_pred = {0.45} [0.39,0.51]
IF MM116 = <0.45, 0.65) THEN MM116_pred = {0.55} [0.49,0.61]
IF MM31 = <0.18, 0.27) AND MM116 = (-inf, 0.75) THEN MM116_pred = {0.46} [0.39,0.53]
IF MM116 = <0.45, 0.85) AND MM31 = <0.25, inf) THEN MM116_pred = {0.70} [0.56,0.84]
IF MM116 = <0.75, inf) THEN MM116_pred = {1.01} [0.82,1.19]
Survival
Load dataset
[24]:
# Raw GitHub location of the BMT ARFF file used by the survival example.
SURVIVAL_DATASET_URL: str = (
    'https://raw.githubusercontent.com/'
    'adaa-polsl/RuleKit/master/data/bmt/'
    'bmt.arff'
)

df: pd.DataFrame = read_arff(SURVIVAL_DATASET_URL)

# Re-encode the status column in place: cast to int first, then to str.
status_as_text = df['survival_status'].astype(int).astype(str)
df['survival_status'] = status_as_text

# The status column is the label; every other column stays in X.
y = df['survival_status']
X = df.drop(columns=['survival_status'])
Define rules and conditions
[25]:
# Expert knowledge for the survival example, as (name, rule text) pairs.
# Survival rules carry no explicit conclusion, so each text ends at THEN.

# One initial rule built from two double-bracketed conditions.
expert_rules: list[tuple[str, str]] = [
    ('rule-0', 'IF [[CD34kgx10d6 = (-inf, 10.0)]] AND [[extcGvHD = {0}]] THEN'),
]

# The CD34kgx10d6 attribute as a whole ("= Any") is preferred.
expert_preferred_conditions: list[tuple[str, str]] = [
    ('attr-preferred-0', 'inf: IF [CD34kgx10d6 = Any] THEN'),
]

# The ANCrecovery attribute as a whole ("= Any") is forbidden.
# NOTE(review): unlike the other entries this text has no "N:" / "inf:"
# prefix -- confirm that this is intended.
expert_forbidden_conditions: list[tuple[str, str]] = [
    ('attr-forbidden-0', 'IF [ANCrecovery = Any] THEN'),
]
Rule induction
[26]:
from rulekit.survival import ExpertSurvivalRules
from rulekit.rules import SurvivalRule
# Induction settings for the expert survival model. 'survival_time' names the
# column of X that holds the survival time; both extend_* switches are off.
srv_options = {
    'survival_time_attr': 'survival_time',
    'minsupp_new': 5,
    'max_growing': 0,
    'extend_using_preferred': False,
    'extend_using_automatic': False,
    'induce_using_preferred': True,
    'induce_using_automatic': True,
}
srv = ExpertSurvivalRules(**srv_options)

# The expert knowledge defined earlier is handed over at fit time, together
# with the training data.
srv_knowledge = {
    'expert_rules': expert_rules,
    'expert_preferred_conditions': expert_preferred_conditions,
    'expert_forbidden_conditions': expert_forbidden_conditions,
}
srv.fit(X, y, **srv_knowledge)

# The fitted model is exposed as a rule set.
ruleset: RuleSet[SurvivalRule] = srv.model
[27]:
# Print every rule of the fitted survival rule set, one per line.
for induced_rule in ruleset.rules:
    print(induced_rule)
IF [[CD34kgx10d6 = (-inf, 10)]] AND [[extcGvHD = {0}]] THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <500142.50, inf) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND Recipientage = <17.85, inf) AND RecipientRh = {1} THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <26, inf) AND Recipientage = <14.30, inf) AND Relapse = {0} THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <26, inf) AND Recipientage = <12, 18.85) AND Gendermatch = {0} AND Donorage = (-inf, 40.64) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND CD3dCD34 = (-inf, 10.97) AND Donorage = (-inf, 49.19) AND PLTrecovery = (-inf, 500142.50) AND Txpostrelapse = {0} AND extcGvHD = {1} THEN
IF [CD34kgx10d6 = <11.86, inf)] AND Relapse = {0} THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND RecipientRh = {1} AND CD3dCD34 = <6.64, inf) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND Recipientageint = {2} AND CD3dCD34 = <0.94, inf) AND Donorage = <36.03, inf) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <22.50, inf) THEN
IF [CD34kgx10d6 = <11.86, inf)] THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND CD3dCD34 = <0.89, inf) AND Rbodymass = <36.50, inf) AND Recipientage = <9.20, inf) AND IIIV = {1} AND PLTrecovery = (-inf, 22.50) AND Stemcellsource = {1} THEN