Expert Rules

This notebook presents example usage of user-guided rule induction, which follows the scheme introduced by the GuideR algorithm (Sikora et al., 2019).
For each problem type (classification, regression, survival), there is an expert class in addition to the basic class, e.g. ExpertRuleClassifier in addition to RuleClassifier. Expert classes allow you to define a set of initial rules, preferred conditions and forbidden conditions.
This tutorial will show you how to define rules and conditions.

Import RuleKit

[1]:
from rulekit import RuleKit
from rulekit.classification import RuleClassifier
from rulekit.params import Measures

Classification

Prepare dataset

[2]:
from scipy.io import arff
import pandas as pd

# Only the first element of loadarff's result (the data records) is used.
seismic_records = arff.loadarff("seismic-bumps.arff")[0]
data_df = pd.DataFrame(seismic_records)

# Cast the label column to integers before splitting it off.
data_df['class'] = data_df['class'].astype(int)

# Target vector is the 'class' column; every other column is a feature.
y = data_df['class']
X = data_df.drop(columns=['class'])

Define rules and conditions

[3]:
# Every entry below is a (name, rule-string) tuple.
# NOTE(review): the "N:" / "inf:" prefix presumably limits how many times a
# condition may be used -- confirm against the RuleKit documentation.

# Initial rules handed to the induction algorithm.
expert_rules = [
    (
        'rule-0',
        'IF [[gimpuls = <-inf, 750)]] THEN class = {0}',
    ),
    (
        'rule-1',
        'IF [[gimpuls = <750, inf)]] THEN class = {1}',
    ),
]

# Conditions / attributes the expert would like to see in the rules.
expert_preferred_conditions = [
    (
        'preferred-condition-0',
        '1: IF [[seismic = {a}]] THEN class = {0}',
    ),
    (
        'preferred-attribute-0',
        '1: IF [[gimpuls = Any]] THEN class = {1}',
    ),
]

# Conditions / attributes the expert does not want in the rules.
expert_forbidden_conditions = [
    (
        'forb-attribute-0',
        '1: IF [[seismoacoustic  = Any]] THEN class = {0}',
    ),
    (
        'forb-attribute-1',
        'inf: IF [[ghazard  = Any]] THEN class = {1}',
    ),
]

Rule induction

[4]:
from rulekit.classification import ExpertRuleClassifier

# Expert knowledge defined in the previous cell, grouped so that it can be
# passed to fit() in a single unpacking.
expert_knowledge = {
    'expert_rules': expert_rules,
    'expert_preferred_conditions': expert_preferred_conditions,
    'expert_forbidden_conditions': expert_forbidden_conditions,
}

clf = ExpertRuleClassifier(
    minsupp_new=8,
    max_growing=0,
    # Both the extension and the induction phases may use preferred
    # (expert) as well as automatic conditions.
    extend_using_preferred=True,
    extend_using_automatic=True,
    induce_using_preferred=True,
    induce_using_automatic=True,
)
clf.fit(X, y, **expert_knowledge)

# Keep the induced model for inspection in the next cell.
ruleset = clf.model
[5]:
# Show every rule of the induced classification rule set, one per line.
for induced_rule in ruleset.rules:
    print(induced_rule)
IF [seismic = {a}] AND gimpuls = (-inf, 521.50) AND genergy = (-inf, 32875) AND nbumps = (-inf, 0.50) THEN class = {0}
IF gimpuls = (-inf, 1252.50) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1342.50) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1427.50) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1653.50) AND genergy = (-inf, 1006585) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1752) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 2733) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = <2965, inf) AND genergy = <634250, inf) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1331) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1655.50) AND genergy = (-inf, 386010) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1686) AND goimpuls = (-inf, 312) AND nbumps5 = (-inf, 0.50) AND nbumps = (-inf, 2.50) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 2892) AND genergy = (-inf, 386010) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 2068.50) AND goimpuls = (-inf, 312) AND genergy = (-inf, 1004565) AND nbumps = (-inf, 2.50) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 2184.50) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 1.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 901) AND goimpuls = (-inf, 96.50) AND senergy = (-inf, 3850) AND nbumps = (-inf, 3.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND goimpuls = (-inf, 312) AND senergy = (-inf, 9600) AND nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND goimpuls = (-inf, 312) AND senergy = (-inf, 8100) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF ghazard = {a} AND goenergy = <-40.50, 68.50) AND maxenergy = (-inf, 5500) AND gimpuls = (-inf, 901) AND goimpuls = <-39.50, inf) AND senergy = <1150, inf) AND nbumps2 = <1.50, inf) THEN class = {0}
IF goenergy = <-48.50, inf) AND gimpuls = (-inf, 695.50) AND maxenergy = <2500, inf) AND goimpuls = <-54.50, inf) AND genergy = <10915, inf) AND nbumps3 = (-inf, 3.50) AND senergy = <3950, inf) AND nbumps2 = (-inf, 1.50) AND nbumps = (-inf, 6.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps = (-inf, 4.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1253.50) AND nbumps3 = (-inf, 2.50) AND nbumps = (-inf, 5.50) THEN class = {0}
IF maxenergy = (-inf, 75000) AND gimpuls = (-inf, 901) AND genergy = (-inf, 378500) AND nbumps3 = (-inf, 3.50) AND nbumps4 = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1139.50) AND goimpuls = (-inf, 312) AND senergy = (-inf, 85450) THEN class = {0}
IF gimpuls = <1150.50, inf) AND goimpuls = <-35.50, inf) AND nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 0.50) AND nbumps = <1.50, inf) THEN class = {0}
IF goenergy = <-18.50, inf) AND gimpuls = <927, inf) AND genergy = (-inf, 508210) AND senergy = (-inf, 5750) AND nbumps2 = <1.50, inf) THEN class = {0}
IF senergy = (-inf, 5750) THEN class = {0}
IF gimpuls = (-inf, 2489.50) AND genergy = (-inf, 318735) AND nbumps3 = (-inf, 2.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF goenergy = <-36.50, inf) AND goimpuls = (-inf, 6.50) AND genergy = <392530, inf) AND senergy = <6750, inf) AND nbumps2 = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 3881.50) AND nbumps = (-inf, 4.50) AND nbumps2 = (-inf, 2.50) THEN class = {0}
IF [gimpuls = <1253.50, inf)] AND goenergy = <-40.50, 87) AND maxenergy = (-inf, 7500) AND genergy = <96260, 673155) AND seismic = {b} AND seismoacoustic = {a} AND senergy = (-inf, 10000) AND nbumps = (-inf, 3.50) THEN class = {1}
IF goenergy = (-inf, 96) AND maxenergy = <1500, inf) AND gimpuls = <605, 1959) AND goimpuls = <-55, 95) AND genergy = <61250, 662435) AND senergy = (-inf, 36050) AND nbumps3 = <0.50, inf) AND nbumps2 = <0.50, inf) AND nbumps = (-inf, 6.50) THEN class = {1}
IF goenergy = (-inf, 186) AND maxenergy = <1500, inf) AND gimpuls = <538.50, inf) AND genergy = <58310, 934630) AND goimpuls = <-55, inf) AND senergy = (-inf, 40650) AND nbumps2 = <0.50, inf) THEN class = {1}
IF gimpuls = <521.50, inf) AND genergy = <58310, inf) AND goimpuls = <-71, inf) AND senergy = <650, inf) AND nbumps = <1.50, inf) AND nbumps2 = <0.50, inf) THEN class = {1}
IF goenergy = (-inf, 97) AND gimpuls = <378, 2132) AND maxenergy = <2500, inf) AND genergy = <34880, 587745) AND goimpuls = (-inf, 95) AND senergy = <3150, 36050) AND nbumps3 = <0.50, inf) AND nbumps2 = <0.50, inf) AND nbumps = (-inf, 6.50) THEN class = {1}
IF goenergy = (-inf, 135.50) AND gimpuls = <306, inf) AND genergy = <19245, inf) AND senergy = <550, inf) AND nbumps = <1.50, inf) THEN class = {1}
IF goenergy = (-inf, -1.50) AND gimpuls = <153.50, 289) AND genergy = <17405, 37085) AND goimpuls = <-60.50, inf) AND senergy = (-inf, 40500) AND nbumps3 = (-inf, 3.50) AND nbumps = <1.50, inf) AND nbumps2 = <0.50, inf) THEN class = {1}
IF goenergy = (-inf, 131.50) AND gimpuls = <1253.50, inf) AND genergy = <54930, 1062020) AND goimpuls = <-60.50, 109) AND shift = {W} AND senergy = (-inf, 36050) AND nbumps2 = (-inf, 2.50) THEN class = {1}
IF gimpuls = <98.50, inf) AND senergy = <650, inf) AND nbumps2 = <0.50, inf) THEN class = {1}
IF goenergy = <-78.50, inf) AND gimpuls = <66, inf) AND goimpuls = <-74.50, inf) AND genergy = <3065, inf) AND senergy = <550, inf) THEN class = {1}
IF goenergy = (-inf, 176.50) AND gimpuls = <131, inf) AND genergy = <48545, inf) THEN class = {1}
IF goenergy = <-4, inf) AND gimpuls = <396, 1445.50) AND genergy = <32795, 49585) AND goimpuls = <-19, inf) AND shift = {W} AND senergy = (-inf, 350) THEN class = {1}
IF goenergy = <-37.50, inf) AND gimpuls = <537.50, 796) AND genergy = <16805, 32020) AND goimpuls = <-36.50, inf) AND senergy = (-inf, 250) THEN class = {1}
IF goenergy = <-37.50, 181) AND gimpuls = <240, 470.50) AND genergy = <19670, 40735) AND goimpuls = <-42.50, inf) AND shift = {W} THEN class = {1}
IF gimpuls = <54.50, inf) AND goimpuls = <-74.50, inf) AND genergy = <1510, inf) AND senergy = (-inf, 115450) THEN class = {1}

Regression

Prepare dataset

[6]:
from scipy.io import arff
import pandas as pd

# Only the first element of loadarff's result (the data records) is used.
methane_records = arff.loadarff("methane-train.arff")[0]
data_df = pd.DataFrame(methane_records)

# 'MM116_pred' is the value to predict; the remaining columns are features.
y = data_df['MM116_pred']
X = data_df.drop(columns=['MM116_pred'])
[7]:
# Display the feature matrix; the rendered output below abbreviates the
# 13368-row frame to its first and last rows.
X
[7]:
MM31 MM116 AS038 PG072 PD BA13 DMM116
0 0.46 1.3 2.4 2.0 1.0 1076.0 0.0
1 0.46 1.3 2.2 1.9 1.0 1076.0 0.0
2 0.49 1.3 2.2 1.9 1.0 1076.0 0.0
3 0.50 1.3 2.3 1.9 1.0 1076.0 0.0
4 0.54 1.3 2.3 1.9 1.0 1076.0 0.0
... ... ... ... ... ... ... ...
13363 0.64 1.2 2.4 1.8 1.0 1077.0 0.0
13364 0.59 1.2 2.4 1.8 1.0 1077.0 0.0
13365 0.60 1.1 2.2 1.8 1.0 1077.0 -0.1
13366 0.64 1.1 2.2 1.8 1.0 1077.0 0.0
13367 0.65 1.2 2.2 1.7 0.0 1077.0 0.1

13368 rows × 7 columns

Define rules and conditions

[8]:
# No initial rules are supplied in the regression example.
expert_rules = None

# Every entry below is a (name, rule-string) tuple.
# NOTE(review): the leading "3:" / "5:" / "inf:" presumably limits how many
# times a condition may be used -- confirm against the RuleKit documentation.
expert_preferred_conditions = [
    ('preferred-condition-0',
     '3: IF PD = <0.5, inf) THEN MM116_pred = {NaN}'),
    ('preferred-condition-1',
     '5: IF PD = <0.5, inf) AND MM116 = (-inf, 1.0) THEN MM116_pred = {NaN}'),
]

# The DMM116 attribute is listed as forbidden.
expert_forbidden_conditions = [
    ('forb-attribute-0',
     'inf: IF DMM116 = Any THEN MM116_pred = {NaN}'),
]

Rule induction

[11]:
from rulekit.regression import ExpertRuleRegressor

# Expert knowledge defined in the previous cell, grouped so that it can be
# passed to fit() in a single unpacking.
expert_knowledge = {
    'expert_rules': expert_rules,
    'expert_preferred_conditions': expert_preferred_conditions,
    'expert_forbidden_conditions': expert_forbidden_conditions,
}

reg = ExpertRuleRegressor(
    minsupp_new=5,
    max_growing=0,
    mean_based_regression=True,
    # Extension uses preferred conditions only (automatic extension is off);
    # induction may use both preferred and automatic conditions.
    extend_using_preferred=True,
    extend_using_automatic=False,
    induce_using_preferred=True,
    induce_using_automatic=True,
)
reg.fit(X, y, **expert_knowledge)

# Keep the induced model for inspection in the next cell.
ruleset = reg.model
[12]:
# Show every rule of the induced regression rule set, one per line.
for induced_rule in ruleset.rules:
    print(induced_rule)
IF MM31 = (-inf, 0.23) THEN MM116_pred = {0.40} [0.39,0.41]
IF MM116 = (-inf, 0.45) AND MM31 = <0.18, 0.24) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM31 = (-inf, 0.25) THEN MM116_pred = {0.44} [0.37,0.51]
IF MM31 = (-inf, 0.26) THEN MM116_pred = {0.46} [0.36,0.55]
IF MM31 = (-inf, 0.28) THEN MM116_pred = {0.49} [0.37,0.61]
IF PD = (-inf, 0.50) AND MM116 = <0.25, inf) AND AS038 = <2, 2.45) AND MM31 = <0.23, inf) AND PG072 = (-inf, 1.95) AND BA13 = (-inf, 1075.50) THEN MM116_pred = {0.71} [0.50,0.93]
IF PD = (-inf, 0.50) AND MM116 = (-inf, 0.25) AND AS038 = <2.35, 2.45) AND MM31 = <0.19, inf) AND PG072 = <1.75, 1.95) AND BA13 = (-inf, 1075.50) THEN MM116_pred = {0.25} [0.20,0.30]
IF MM116 = (-inf, 0.45) AND MM31 = <0.18, inf) AND BA13 = (-inf, 1077.50) THEN MM116_pred = {0.40} [0.37,0.43]
IF MM116 = (-inf, 0.55) AND MM31 = (-inf, 0.32) THEN MM116_pred = {0.45} [0.39,0.51]
IF MM116 = (-inf, 0.55) THEN MM116_pred = {0.45} [0.39,0.52]
IF MM116 = <0.45, 0.65) THEN MM116_pred = {0.55} [0.49,0.61]
IF MM116 = <0.45, 0.75) THEN MM116_pred = {0.60} [0.49,0.71]
IF MM116 = <0.45, 0.85) AND MM31 = <0.25, inf) THEN MM116_pred = {0.70} [0.56,0.84]
IF MM116 = <0.70, inf) THEN MM116_pred = {0.97} [0.77,1.17]

Survival

Prepare dataset

[14]:
from scipy.io import arff
import pandas as pd

# Open the file explicitly with cp1252 and let the context manager close it.
# Previously the handle was created inline and never closed.
with open('bmt.arff', 'r', encoding="cp1252") as bmt_file:
    data_df = pd.DataFrame(arff.loadarff(bmt_file)[0])

# code to fix the problem with encoding of the file:
# the object-dtype columns are decoded with cp1252 and written back in place.
decoded_df = data_df.select_dtypes([object])
decoded_df = decoded_df.stack().str.decode("cp1252").unstack()
for col in decoded_df:
    data_df[col] = decoded_df[col]

# Replace '?' cells with None (presumably the missing-value marker of the
# ARFF file -- confirm against the dataset).
data_df = data_df.replace({'?': None})

# 'survival_status' is the target; all remaining columns stay in X.
X = data_df.drop(['survival_status'], axis=1)
y = data_df['survival_status']

Define rules and conditions

[15]:
# Every entry below is a (name, rule-string) tuple.

# Initial rule handed to the induction algorithm.
expert_rules = [
    ('rule-0',
     'IF [[CD34kgx10d6 = (-inf, 10.0)]] AND [[extcGvHD = {0}]] THEN survival_status = {NaN}'),
]

# Attribute the expert would like to see in the rules.
expert_preferred_conditions = [
    ('attr-preferred-0',
     'inf: IF [CD34kgx10d6 = Any] THEN survival_status = {NaN}'),
]

# Attribute the expert does not want in the rules.
expert_forbidden_conditions = [
    ('attr-forbidden-0',
     'IF [ANCrecovery = Any] THEN survival_status = {NaN}'),
]

Rule induction

[16]:
from rulekit.survival import ExpertSurvivalRules

# Expert knowledge defined in the previous cell, grouped so that it can be
# passed to fit() in a single unpacking.
expert_knowledge = {
    'expert_rules': expert_rules,
    'expert_preferred_conditions': expert_preferred_conditions,
    'expert_forbidden_conditions': expert_forbidden_conditions,
}

srv = ExpertSurvivalRules(
    # Name of the column in X that holds the survival time.
    survival_time_attr='survival_time',
    minsupp_new=5,
    max_growing=0,
    # Rule extension is switched off for both condition sources; induction
    # may use both preferred and automatic conditions.
    extend_using_preferred=False,
    extend_using_automatic=False,
    induce_using_preferred=True,
    induce_using_automatic=True,
)
srv.fit(X, y, **expert_knowledge)

# Keep the induced model for inspection in the next cell.
ruleset = srv.model
[17]:
# Show every rule of the induced survival rule set, one per line.
for induced_rule in ruleset.rules:
    print(induced_rule)
IF [[CD34kgx10d6 = (-inf, 10)]] AND [[extcGvHD = {0}]] THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <500142.50, inf) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND RecipientRh = {1} AND Recipientage = <17.85, inf) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND Relapse = {0} AND PLTrecovery = <26, inf) AND Recipientage = <14.30, inf) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND Donorage = (-inf, 40.64) AND Gendermatch = {0} AND PLTrecovery = <26, inf) AND Recipientage = <12, 18.85) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND Donorage = (-inf, 49.19) AND extcGvHD = {1} AND PLTrecovery = (-inf, 500142.50) AND Txpostrelapse = {0} AND CD3dCD34 = (-inf, 10.97) THEN
IF [CD34kgx10d6 = <11.86, inf)] AND Relapse = {0} THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND RecipientRh = {1} AND CD3dCD34 = <6.64, inf) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND Donorage = <36.03, inf) AND Recipientageint = {2} AND CD3dCD34 = <0.94, inf) THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <22.50, inf) THEN
IF [CD34kgx10d6 = <11.86, inf)] THEN
IF [CD34kgx10d6 = (-inf, 11.86)] AND Stemcellsource = {1} AND PLTrecovery = (-inf, 22.50) AND CD3dCD34 = <0.89, inf) AND Rbodymass = <36.50, inf) AND Recipientage = <9.20, inf) AND IIIV = {1} THEN