{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Raport_przezyciowy.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "rulekit", "display_name": "rulekit", "language": "python" }, "language_info": { "name": "python", "version": "3.8.6" }, "metadata": { "interpreter": { "hash": "62266c16fff41e971c13e9cb2ad3d47e4ef45d0678714c255381eb9fdcbd7032" } } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "-Uy-yBGsd9W1" }, "source": [ "# Survival analysis" ] }, { "source": [ "This notebook presents an example of using the RuleKit package to solve a survival analysis problem on the `bmt` dataset. You can download the dataset [here](https://raw.githubusercontent.com/adaa-polsl/RuleKit/master/data/bmt/bmt.arff) \n", "\n", "This tutorial will cover topics such as: \n", "- training a model \n", "- changing model hyperparameters \n", "- hyperparameter tuning \n", "- calculating metrics for the model \n", "- getting RuleKit inbuilt " ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "markdown", "metadata": { "id": "KjtU7PA8eOTr" }, "source": [ "## Summary of the dataset" ] }, { "cell_type": "code", "metadata": { "id": "Tp1TpfCkd58n" }, "source": [ "from scipy.io import arff\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", "datasets_path = \"\" \n", "\n", "file_name = 'bmt.arff'\n", "\n", "data_df = pd.DataFrame(arff.loadarff(open(datasets_path + file_name, 'r', encoding=\"cp1252\"))[0])\n", "\n", "# code to fix the problem with encoding of the file\n", "tmp_df = data_df.select_dtypes([object]) \n", "tmp_df = tmp_df.stack().str.decode(\"cp1252\").unstack()\n", "for col in tmp_df:\n", " data_df[col] = tmp_df[col]\n", " \n", "data_df = data_df.replace({'?': None})" ], "execution_count": 1, "outputs": [] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Recipientgender Stemcellsource Donorage 
Donorage35 IIIV Gendermatch \\\n", "0 1 1 22.830137 0 1 0 \n", "1 1 0 23.342466 0 1 0 \n", "2 1 0 26.394521 0 1 0 \n", "3 0 0 39.684932 1 1 0 \n", "4 0 1 33.358904 0 0 0 \n", ".. ... ... ... ... ... ... \n", "182 1 1 37.575342 1 1 0 \n", "183 0 1 22.895890 0 0 0 \n", "184 0 1 27.347945 0 1 0 \n", "185 1 1 27.780822 0 1 0 \n", "186 1 1 55.553425 1 1 0 \n", "\n", " DonorABO RecipientABO RecipientRh ABOmatch ... extcGvHD CD34kgx10d6 \\\n", "0 1 1 1 0 ... 1 7.20 \n", "1 -1 -1 1 0 ... 1 4.50 \n", "2 -1 -1 1 0 ... 1 7.94 \n", "3 1 2 1 1 ... None 4.25 \n", "4 1 2 0 1 ... 1 51.85 \n", ".. ... ... ... ... ... ... ... \n", "182 1 1 0 0 ... 1 11.08 \n", "183 1 0 1 1 ... 1 4.64 \n", "184 1 -1 1 1 ... 1 7.73 \n", "185 1 0 1 1 ... 0 15.41 \n", "186 1 2 1 1 ... 1 9.91 \n", "\n", " CD3dCD34 CD3dkgx10d8 Rbodymass ANCrecovery PLTrecovery \\\n", "0 1.338760 5.38 35.0 19.0 51.0 \n", "1 11.078295 0.41 20.6 16.0 37.0 \n", "2 19.013230 0.42 23.4 23.0 20.0 \n", "3 29.481647 0.14 50.0 23.0 29.0 \n", "4 3.972255 13.05 9.0 14.0 14.0 \n", ".. ... ... ... ... ... \n", "182 2.522750 4.39 44.0 15.0 22.0 \n", "183 1.038858 4.47 44.5 12.0 30.0 \n", "184 1.635559 4.73 33.0 16.0 16.0 \n", "185 8.077770 1.91 24.0 13.0 14.0 \n", "186 0.948135 10.45 37.0 18.0 20.0 \n", "\n", " time_to_aGvHD_III_IV survival_time survival_status \n", "0 32.0 999.0 0.0 \n", "1 1000000.0 163.0 1.0 \n", "2 1000000.0 435.0 1.0 \n", "3 19.0 53.0 1.0 \n", "4 1000000.0 2043.0 0.0 \n", ".. ... ... ... \n", "182 16.0 385.0 1.0 \n", "183 1000000.0 634.0 1.0 \n", "184 1000000.0 1895.0 0.0 \n", "185 54.0 382.0 1.0 \n", "186 1000000.0 1109.0 0.0 \n", "\n", "[187 rows x 37 columns]" ], "text/html": "
| \n | Recipientgender | \nStemcellsource | \nDonorage | \nDonorage35 | \nIIIV | \nGendermatch | \nDonorABO | \nRecipientABO | \nRecipientRh | \nABOmatch | \n... | \nextcGvHD | \nCD34kgx10d6 | \nCD3dCD34 | \nCD3dkgx10d8 | \nRbodymass | \nANCrecovery | \nPLTrecovery | \ntime_to_aGvHD_III_IV | \nsurvival_time | \nsurvival_status | \n
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n1 | \n1 | \n22.830137 | \n0 | \n1 | \n0 | \n1 | \n1 | \n1 | \n0 | \n... | \n1 | \n7.20 | \n1.338760 | \n5.38 | \n35.0 | \n19.0 | \n51.0 | \n32.0 | \n999.0 | \n0.0 | \n
| 1 | \n1 | \n0 | \n23.342466 | \n0 | \n1 | \n0 | \n-1 | \n-1 | \n1 | \n0 | \n... | \n1 | \n4.50 | \n11.078295 | \n0.41 | \n20.6 | \n16.0 | \n37.0 | \n1000000.0 | \n163.0 | \n1.0 | \n
| 2 | \n1 | \n0 | \n26.394521 | \n0 | \n1 | \n0 | \n-1 | \n-1 | \n1 | \n0 | \n... | \n1 | \n7.94 | \n19.013230 | \n0.42 | \n23.4 | \n23.0 | \n20.0 | \n1000000.0 | \n435.0 | \n1.0 | \n
| 3 | \n0 | \n0 | \n39.684932 | \n1 | \n1 | \n0 | \n1 | \n2 | \n1 | \n1 | \n... | \nNone | \n4.25 | \n29.481647 | \n0.14 | \n50.0 | \n23.0 | \n29.0 | \n19.0 | \n53.0 | \n1.0 | \n
| 4 | \n0 | \n1 | \n33.358904 | \n0 | \n0 | \n0 | \n1 | \n2 | \n0 | \n1 | \n... | \n1 | \n51.85 | \n3.972255 | \n13.05 | \n9.0 | \n14.0 | \n14.0 | \n1000000.0 | \n2043.0 | \n0.0 | \n
| ... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
| 182 | \n1 | \n1 | \n37.575342 | \n1 | \n1 | \n0 | \n1 | \n1 | \n0 | \n0 | \n... | \n1 | \n11.08 | \n2.522750 | \n4.39 | \n44.0 | \n15.0 | \n22.0 | \n16.0 | \n385.0 | \n1.0 | \n
| 183 | \n0 | \n1 | \n22.895890 | \n0 | \n0 | \n0 | \n1 | \n0 | \n1 | \n1 | \n... | \n1 | \n4.64 | \n1.038858 | \n4.47 | \n44.5 | \n12.0 | \n30.0 | \n1000000.0 | \n634.0 | \n1.0 | \n
| 184 | \n0 | \n1 | \n27.347945 | \n0 | \n1 | \n0 | \n1 | \n-1 | \n1 | \n1 | \n... | \n1 | \n7.73 | \n1.635559 | \n4.73 | \n33.0 | \n16.0 | \n16.0 | \n1000000.0 | \n1895.0 | \n0.0 | \n
| 185 | \n1 | \n1 | \n27.780822 | \n0 | \n1 | \n0 | \n1 | \n0 | \n1 | \n1 | \n... | \n0 | \n15.41 | \n8.077770 | \n1.91 | \n24.0 | \n13.0 | \n14.0 | \n54.0 | \n382.0 | \n1.0 | \n
| 186 | \n1 | \n1 | \n55.553425 | \n1 | \n1 | \n0 | \n1 | \n2 | \n1 | \n1 | \n... | \n1 | \n9.91 | \n0.948135 | \n10.45 | \n37.0 | \n18.0 | \n20.0 | \n1000000.0 | \n1109.0 | \n0.0 | \n
187 rows × 37 columns
\n| \n | Donorage | \nRecipientage | \nCD34kgx10d6 | \nCD3dCD34 | \nCD3dkgx10d8 | \nRbodymass | \nANCrecovery | \nPLTrecovery | \ntime_to_aGvHD_III_IV | \nsurvival_time | \nsurvival_status | \n
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | \n187.000000 | \n187.000000 | \n187.000000 | \n182.000000 | \n182.000000 | \n185.000000 | \n187.000000 | \n187.000000 | \n187.000000 | \n187.000000 | \n187.000000 | \n
| mean | \n33.472068 | \n9.931551 | \n11.891781 | \n5.385096 | \n4.745714 | \n35.801081 | \n26752.866310 | \n90937.919786 | \n775408.042781 | \n938.743316 | \n0.454545 | \n
| std | \n8.271826 | \n5.305639 | \n9.914386 | \n9.598716 | \n3.859128 | \n19.650922 | \n161747.200525 | \n288242.407688 | \n418425.252689 | \n849.589495 | \n0.499266 | \n
| min | \n18.646575 | \n0.600000 | \n0.790000 | \n0.204132 | \n0.040000 | \n6.000000 | \n9.000000 | \n9.000000 | \n10.000000 | \n6.000000 | \n0.000000 | \n
| 25% | \n27.039726 | \n5.050000 | \n5.350000 | \n1.786683 | \n1.687500 | \n19.000000 | \n13.000000 | \n16.000000 | \n1000000.000000 | \n168.500000 | \n0.000000 | \n
| 50% | \n33.550685 | \n9.600000 | \n9.720000 | \n2.734462 | \n4.325000 | \n33.000000 | \n15.000000 | \n21.000000 | \n1000000.000000 | \n676.000000 | \n0.000000 | \n
| 75% | \n40.117809 | \n14.050000 | \n15.415000 | \n5.823565 | \n6.785000 | \n50.600000 | \n17.000000 | \n37.000000 | \n1000000.000000 | \n1604.000000 | \n1.000000 | \n
| max | \n55.553425 | \n20.200000 | \n57.780000 | \n99.560970 | \n20.020000 | \n103.400000 | \n1000000.000000 | \n1000000.000000 | \n1000000.000000 | \n3364.000000 | \n1.000000 | \n