Expert Rules¶
This notebook presents example usage of user-guided rule induction, which follows the scheme introduced by the GuideR algorithm (Sikora et al., 2019).
For each problem type (classification, regression, survival) there is, in addition to the basic class, an expert class, e.g. RuleClassifier and ExpertRuleClassifier. Expert classes allow you to define a set of initial rules, preferred conditions and forbidden conditions.
This tutorial shows how to define rules and conditions.
Import and init RuleKit¶
[ ]:
# Package entry point plus names from the classification and params modules.
# NOTE(review): RuleClassifier and Measures are not used in the cells visible
# in this notebook — confirm whether they are still needed.
from rulekit import RuleKit
from rulekit.classification import RuleClassifier
from rulekit.params import Measures
# Initialise RuleKit once, up front, before any model is created or fitted.
# NOTE(review): presumably this starts the backend the estimators rely on —
# confirm against the RuleKit documentation.
RuleKit.init()
Classification¶
Prepare dataset¶
[24]:
from scipy.io import arff
import pandas as pd

# Location of the ARFF file; the prefix is simply concatenated with the name.
datasets_path = ""
file_name = "seismic-bumps.arff"

# loadarff yields a (records, metadata) pair; only the records are kept.
data_df = pd.DataFrame(arff.loadarff(f"{datasets_path}{file_name}")[0])
# Cast the label column to integers.
data_df['class'] = data_df['class'].astype(int)

# Split the frame into the label series and the remaining feature columns.
y = data_df['class']
x = data_df.drop(columns=['class'])
Define rules and conditions¶
[25]:
# Expert knowledge for the classification example.
# Every entry is a (name, rule string) pair.

# Initial rules supplied by the expert.
expert_rules = [
    ('rule-0', 'IF [[gimpuls = (-inf, 750)]] THEN class = {0}'),
    ('rule-1', 'IF [[gimpuls = <750, inf)]] THEN class = {1}'),
]

# Preferred conditions / attributes; each rule string starts with a "<count>:" prefix.
expert_preferred_conditions = [
    ('preferred-condition-0', '1: IF [[seismic = {a}]] THEN class = {0}'),
    ('preferred-attribute-0', '1: IF [[gimpuls = Any]] THEN class = {1}'),
]

# Forbidden conditions / attributes.
expert_forbidden_conditions = [
    ('forb-attribute-0', '1: IF [[seismoacoustic = Any]] THEN class = {0}'),
    ('forb-attribute-1', 'inf: IF [[ghazard = Any]] THEN class = {1}'),
]
Rule induction¶
[26]:
from rulekit.classification import ExpertRuleClassifier

# Configure the expert (user-guided) classifier.
clf = ExpertRuleClassifier(
    min_rule_covered=8,
    max_growing=0,
    extend_using_preferred=True,
    extend_using_automatic=True,
    induce_using_preferred=True,
    induce_using_automatic=True,
)

# Train on the prepared data together with the expert knowledge
# (initial rules, preferred conditions, forbidden conditions).
clf.fit(
    values=x,
    labels=y,
    expert_rules=expert_rules,
    expert_preferred_conditions=expert_preferred_conditions,
    expert_forbidden_conditions=expert_forbidden_conditions,
)

# Keep the induced rule set for inspection in the next cell.
ruleset = clf.model
[27]:
# Print every rule of the induced classification rule set, one per line.
for rule in ruleset.rules:
    print(rule)
IF [[gimpuls = (-inf, 750)]] AND [seismic = {a}] AND nbumps4 = (-inf, 0.50) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1252.50) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1342.50) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1427.50) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1653.50) AND genergy = (-inf, 1006585) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1752) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 2733) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = <2965, inf) AND genergy = <634250, inf) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1331) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1655.50) AND genergy = (-inf, 386010) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1686) AND goimpuls = (-inf, 312) AND nbumps5 = (-inf, 0.50) AND nbumps = (-inf, 2.50) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 2892) AND genergy = (-inf, 386010) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 2068.50) AND goimpuls = (-inf, 312) AND genergy = (-inf, 1004565) AND nbumps = (-inf, 2.50) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 2184.50) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 1.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 901) AND goimpuls = (-inf, 96.50) AND senergy = (-inf, 3850) AND nbumps = (-inf, 3.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND goimpuls = (-inf, 312) AND senergy = (-inf, 9600) AND nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND goimpuls = (-inf, 312) AND senergy = (-inf, 8100) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF ghazard = {a} AND goenergy = <-40.50, 68.50) AND maxenergy = (-inf, 5500) AND gimpuls = (-inf, 901) AND goimpuls = <-39.50, inf) AND senergy = <1150, inf) AND nbumps2 = <1.50, inf) THEN class = {0}
IF goenergy = <-48.50, inf) AND gimpuls = (-inf, 695.50) AND maxenergy = <2500, inf) AND goimpuls = <-54.50, inf) AND genergy = <10915, inf) AND nbumps3 = (-inf, 3.50) AND senergy = <3950, inf) AND nbumps2 = (-inf, 1.50) AND nbumps = (-inf, 6.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps = (-inf, 4.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 2.50) AND nbumps = (-inf, 5.50) THEN class = {0}
IF maxenergy = (-inf, 75000) AND gimpuls = (-inf, 901) AND genergy = (-inf, 378500) AND nbumps3 = (-inf, 3.50) AND nbumps4 = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1139.50) AND goimpuls = (-inf, 312) AND senergy = (-inf, 85450) THEN class = {0}
IF gimpuls = <1150.50, inf) AND goimpuls = <-35.50, inf) AND nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 0.50) AND nbumps = <1.50, inf) THEN class = {0}
IF goenergy = <-18.50, inf) AND gimpuls = <927, inf) AND genergy = (-inf, 508210) AND senergy = (-inf, 5750) AND nbumps2 = <1.50, inf) THEN class = {0}
IF senergy = (-inf, 5750) THEN class = {0}
IF gimpuls = (-inf, 2489.50) AND genergy = (-inf, 318735) AND nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF goenergy = <-36.50, inf) AND goimpuls = (-inf, 6.50) AND genergy = <392530, inf) AND senergy = <6750, inf) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 3881.50) AND nbumps = (-inf, 4.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF [gimpuls = <1253.50, inf)] AND goenergy = <-40.50, 87) AND maxenergy = (-inf, 7500) AND genergy = <96260, 673155) AND seismic = {b} AND seismoacoustic = {a} AND senergy = (-inf, 10000) AND nbumps = (-inf, 3.50) THEN class = {1}
IF goenergy = (-inf, 96) AND maxenergy = <1500, inf) AND gimpuls = <605, 1959) AND goimpuls = <-55, 95) AND genergy = <61250, 662435) AND senergy = (-inf, 36050) AND nbumps3 = <0.50, inf) AND nbumps2 = <0.50, inf) AND nbumps = (-inf, 6.50) THEN class = {1}
IF goenergy = (-inf, 186) AND maxenergy = <1500, inf) AND gimpuls = <538.50, inf) AND genergy = <58310, 934630) AND goimpuls = <-55, inf) AND senergy = (-inf, 40650) AND nbumps2 = <0.50, inf) THEN class = {1}
IF gimpuls = <521.50, inf) AND genergy = <58310, inf) AND goimpuls = <-71, inf) AND senergy = <650, inf) AND nbumps = <1.50, inf) AND nbumps2 = <0.50, inf) THEN class = {1}
IF goenergy = (-inf, 97) AND gimpuls = <378, 2132) AND maxenergy = <2500, inf) AND genergy = <34880, 587745) AND goimpuls = (-inf, 95) AND senergy = <3150, 36050) AND nbumps3 = <0.50, inf) AND nbumps2 = <0.50, inf) AND nbumps = (-inf, 6.50) THEN class = {1}
IF goenergy = (-inf, 135.50) AND gimpuls = <306, inf) AND genergy = <19245, inf) AND senergy = <550, inf) AND nbumps = <1.50, inf) THEN class = {1}
IF goenergy = (-inf, -1.50) AND gimpuls = <153.50, 289) AND genergy = <17405, 37085) AND goimpuls = <-60.50, inf) AND senergy = (-inf, 40500) AND nbumps3 = (-inf, 3.50) AND nbumps = <1.50, inf) AND nbumps2 = <0.50, inf) THEN class = {1}
IF goenergy = (-inf, 131.50) AND gimpuls = <1253.50, inf) AND genergy = <54930, 1062020) AND goimpuls = <-60.50, 109) AND shift = {W} AND senergy = (-inf, 36050) AND nbumps2 = (-inf, 2.50) THEN class = {1}
IF gimpuls = <98.50, inf) AND senergy = <650, inf) AND nbumps2 = <0.50, inf) THEN class = {1}
IF goenergy = <-78.50, inf) AND gimpuls = <66, inf) AND goimpuls = <-74.50, inf) AND genergy = <3065, inf) AND senergy = <550, inf) THEN class = {1}
IF goenergy = (-inf, 176.50) AND gimpuls = <131, inf) AND genergy = <48545, inf) THEN class = {1}
IF goenergy = <-4, inf) AND gimpuls = <396, 1445.50) AND genergy = <32795, 49585) AND goimpuls = <-19, inf) AND shift = {W} AND senergy = (-inf, 350) THEN class = {1}
IF goenergy = <-37.50, inf) AND gimpuls = <537.50, 796) AND genergy = <16805, 32020) AND goimpuls = <-36.50, inf) AND senergy = (-inf, 250) THEN class = {1}
IF goenergy = <-37.50, 181) AND gimpuls = <240, 470.50) AND genergy = <19670, 40735) AND goimpuls = <-42.50, inf) AND shift = {W} THEN class = {1}
IF gimpuls = <54.50, inf) AND goimpuls = <-74.50, inf) AND genergy = <1510, inf) AND senergy = (-inf, 115450) THEN class = {1}
Regression¶
Prepare dataset¶
[10]:
from scipy.io import arff
import pandas as pd

# Location of the ARFF file; the prefix is simply concatenated with the name.
datasets_path = ""
file_name = "methane-train.arff"

# loadarff yields a (records, metadata) pair; only the records are kept.
data_df = pd.DataFrame(arff.loadarff(f"{datasets_path}{file_name}")[0])

# 'MM116_pred' is the regression target; every other column is a feature.
y = data_df['MM116_pred']
x = data_df.drop(columns=['MM116_pred'])
Define rules and conditions¶
[11]:
# Expert knowledge for the regression example.
# No initial rules are supplied here; only preferred and forbidden conditions.
expert_rules = None

# Each entry is a (name, rule string) pair.
expert_preferred_conditions = [
    ('preferred-condition-0', '3: IF PD = <0.5, inf) THEN MM116_pred = {NaN}'),
    (
        'preferred-condition-1',
        '5: IF PD = <0.5, inf) AND MM116 = (-inf, 1.0) THEN MM116_pred = {NaN}',
    ),
]

expert_forbidden_conditions = [
    ('forb-attribute-0', 'inf: IF DMM116 = Any THEN MM116_pred = {NaN}'),
]
Rule induction¶
[12]:
from rulekit.regression import ExpertRuleRegressor

# Configure the expert (user-guided) regressor.
# Unlike the classification example, extend_using_automatic is switched off.
reg = ExpertRuleRegressor(
    min_rule_covered=5,
    max_growing=0,
    extend_using_preferred=True,
    extend_using_automatic=False,
    induce_using_preferred=True,
    induce_using_automatic=True,
)

# Train on the prepared data together with the expert knowledge.
reg.fit(
    values=x,
    labels=y,
    expert_rules=expert_rules,
    expert_preferred_conditions=expert_preferred_conditions,
    expert_forbidden_conditions=expert_forbidden_conditions,
)

# Keep the induced rule set for inspection in the next cell.
ruleset = reg.model
[13]:
# Print every rule of the induced regression rule set, one per line.
for rule in ruleset.rules:
    print(rule)
IF MM116 = (-inf, 0.45) AND MM31 = <0.18, 0.24) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM31 = (-inf, 0.25) THEN MM116_pred = {0.40} [0.33,0.47]
IF AS038 = (-inf, 2.45) AND MM31 = (-inf, 0.26) THEN MM116_pred = {0.40} [0.31,0.49]
IF MM31 = <0.18, 0.28) THEN MM116_pred = {0.50} [0.38,0.62]
IF MM116 = <0.25, 0.45) AND MM31 = <0.18, inf) AND BA13 = (-inf, 1077.50) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM116 = (-inf, 0.25) THEN MM116_pred = {0.20} [0.15,0.25]
IF MM116 = <0.45, 0.55) AND PG072 = <1.65, inf) THEN MM116_pred = {0.50} [0.47,0.53]
IF MM116 = <0.55, 0.65) AND MM31 = <0.24, inf) THEN MM116_pred = {0.60} [0.56,0.64]
IF MM116 = <0.45, 0.75) AND AS038 = (-inf, 2.45) AND MM31 = (-inf, 0.29) AND BA13 = <1072.50, 1077.50) THEN MM116_pred = {0.50} [0.47,0.53]
IF PD = (-inf, 0.50) AND MM116 = <0.45, 0.85) AND AS038 = (-inf, 2.45) AND MM31 = (-inf, 0.29) AND BA13 = <1072.50, 1077.50) THEN MM116_pred = {0.50} [0.44,0.56]
IF MM116 = (-inf, 0.75) THEN MM116_pred = {0.50} [0.37,0.63]
IF MM116 = <0.45, 0.85) THEN MM116_pred = {0.70} [0.55,0.85]
IF MM116 = <0.45, inf) AND MM31 = <0.30, inf) THEN MM116_pred = {0.90} [0.68,1.12]
IF MM116 = <0.70, inf) THEN MM116_pred = {0.90} [0.70,1.10]
Survival¶
Prepare dataset¶
[30]:
from scipy.io import arff
import pandas as pd

datasets_path = ""
file_name = 'bmt.arff'

# Open the file explicitly as cp1252 text and let the context manager close
# the handle once parsing is done (a bare open() call would leave it open).
with open(datasets_path + file_name, 'r', encoding="cp1252") as arff_file:
    data_df = pd.DataFrame(arff.loadarff(arff_file)[0])

# Work around the file's encoding problem: decode the values of the
# object-typed columns from cp1252 and write them back into the frame.
tmp_df = data_df.select_dtypes([object])
tmp_df = tmp_df.stack().str.decode("cp1252").unstack()
for col in tmp_df:
    data_df[col] = tmp_df[col]

# Replace '?' placeholders with None.
# NOTE(review): presumably '?' is the file's missing-value marker — confirm.
data_df = data_df.replace({'?': None})

# 'survival_status' is the label; every other column is passed as a feature.
x = data_df.drop(['survival_status'], axis=1)
y = data_df['survival_status']
Define rules and conditions¶
[31]:
# Expert knowledge for the survival example.
# Every entry is a (name, rule string) pair.

# Initial rule supplied by the expert.
expert_rules = [
    (
        'rule-0',
        'IF [[CD34kgx10d6 = (-inf, 10.0)]] AND [[extcGvHD = {0}]] THEN survival_status = {NaN}',
    ),
]

# Preferred attribute.
expert_preferred_conditions = [
    ('attr-preferred-0', 'inf: IF [CD34kgx10d6 = Any] THEN survival_status = {NaN}'),
]

# Forbidden attribute.
expert_forbidden_conditions = [
    ('attr-forbidden-0', 'IF [ANCrecovery = Any] THEN survival_status = {NaN}'),
]
Rule induction¶
[33]:
from rulekit.survival import ExpertSurvivalRules

# Configure the expert (user-guided) survival model.
# The survival time is read from the 'survival_time' column of the features;
# both extend_* options are switched off in this example.
srv = ExpertSurvivalRules(
    survival_time_attr='survival_time',
    min_rule_covered=5,
    max_growing=0,
    extend_using_preferred=False,
    extend_using_automatic=False,
    induce_using_preferred=True,
    induce_using_automatic=True,
)

# Train on the prepared data together with the expert knowledge.
srv.fit(
    values=x,
    labels=y,
    expert_rules=expert_rules,
    expert_preferred_conditions=expert_preferred_conditions,
    expert_forbidden_conditions=expert_forbidden_conditions,
)

# Keep the induced rule set for inspection in the next cell.
ruleset = srv.model
[34]:
# Print every rule of the induced survival rule set, one per line.
for rule in ruleset.rules:
    print(rule)
IF [[CD34kgx10d6 = (-inf, 10)]] AND [[extcGvHD = {0}]] THEN survival_status = {NaN}
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <500142.50, inf) THEN survival_status = {NaN}
IF [CD34kgx10d6 = (-inf, 11.86)] AND RecipientRh = {1} AND Recipientage = <17.85, inf) THEN survival_status = {NaN}
IF [CD34kgx10d6 = (-inf, 11.86)] AND Relapse = {0} AND PLTrecovery = <26, inf) AND Recipientage = <14.30, inf) THEN survival_status = {NaN}
IF [CD34kgx10d6 = (-inf, 11.86)] AND Donorage = (-inf, 40.64) AND Gendermatch = {0} AND PLTrecovery = <26, inf) AND Recipientage = <12, 18.85) THEN survival_status = {NaN}
IF [CD34kgx10d6 = (-inf, 11.86)] AND RecipientRh = {1} AND CD3dCD34 = <6.64, inf) THEN survival_status = {NaN}
IF [CD34kgx10d6 = <11.86, inf)] AND Relapse = {0} THEN survival_status = {NaN}
IF [CD34kgx10d6 = (-inf, 11.86)] AND Relapse = {0} AND extcGvHD = {1} THEN survival_status = {NaN}
IF [CD34kgx10d6 = (-inf, 11.86)] AND Donorage35 = {1} AND Rbodymass = <37.65, inf) THEN survival_status = {NaN}
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <19.50, inf) AND Donorage35 = {0} AND CD3dkgx10d8 = <0.92, inf) AND CD3dCD34 = <0.97, inf) AND Rbodymass = (-inf, 43.50) AND Recipientage = (-inf, 12.95) THEN survival_status = {NaN}
IF [CD34kgx10d6 = <11.86, inf)] THEN survival_status = {NaN}