For this project, we use the UCI "Diabetes 130-US hospitals" dataset. The file (diabetic_data.csv) represents 10 years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks.
The dataset contains 50 explanatory variables describing the patient and hospital outcomes. We predict whether a patient discharged from a hospital will be readmitted within 30 days. The data is preprocessed and split into training and test sets; both sets have been taken from this folder.
The primary objective of this project's predictive analysis is to build a binary classification model that predicts early (<30 days) readmission from the patient's features, i.e. whether a patient will be readmitted to the hospital within 30 days of being discharged.
We will start by cleaning and preprocessing the provided data. Next, features are selected and the class imbalance in the dataset is handled using various techniques. Finally, several machine learning models are trained, evaluated on a validation set, and optimized for the best performance. The best of the optimized model(s) is/are then evaluated on the held-out test set.
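As a rough preview of that workflow, here is a minimal sketch of the scale-train-evaluate loop, run on synthetic data rather than the diabetes dataset; the model and metric below are placeholders, not the final choices:

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Synthetic stand-in for the preprocessed feature matrix and binary target
rng = np.random.RandomState(0)
X = rng.normal(size=(500, 5))
y = (X[:, 0] + 0.5 * rng.normal(size=500) > 0).astype(int)

# Held-out split, stratified to preserve the class ratio
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y)

# Scaling + classifier chained in one pipeline
model = Pipeline([('scale', StandardScaler()),
                  ('clf', LogisticRegression())])
model.fit(X_tr, y_tr)
auc = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
```

The same pattern (fit on the training split, score on the held-out split) is what the rest of the notebook elaborates with real features and several models.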
Here is a short description of each of the dataset's features:
encounter_id
- Unique identifier of a specific encounter
patient_nbr
- Unique identifier of a specific patient
race
- Patient's race (Caucasian, Asian, African American, Hispanic, or other)
gender
- Patient's gender (Male, Female, or unknown/invalid)
age
- Patient's age group in 10-year intervals (such as 0-10 or 20-30)
weight
- Patient's weight group in intervals of 25 pounds
admission_type_id
- Integer identifier corresponding to 8 distinct values (1-8)
discharge_disposition_id
- Integer identifier corresponding to 26 distinct values indicating where the patient went at the conclusion of the encounter
admission_source_id
- Integer identifier corresponding to 17 distinct values for the source of the patient's admission
time_in_hospital
- Integer number of days the patient was admitted to the hospital
payer_code
- Identifier corresponding to 18 distinct values, with a few missing
medical_specialty
- Specialty of the admitting physician, corresponding to 73 distinct values
num_lab_procedures
- Number of lab procedures performed during the encounter
num_procedures
- Number of procedures performed during the encounter (other than lab tests)
num_medications
- Number of distinct medications administered during the encounter
number_outpatient
- Number of the patient's outpatient visits before the encounter
number_emergency
- Number of the patient's emergency visits before the encounter
number_inpatient
- Number of the patient's inpatient visits before the encounter
diag_1
- Primary diagnosis (coded as the first three digits of an ICD9 code); 717 distinct values
diag_2
- Secondary diagnosis (coded as the first three digits of an ICD9 code); 749 distinct values
diag_3
- Additional secondary diagnosis (coded as the first three digits of an ICD9 code); 790 distinct values
number_diagnoses
- Number of diagnoses entered for the patient
max_glu_serum
- Indicates the range of the glucose serum test result, or that the test was not taken
A1Cresult
- Indicates the range of the A1c test result, or that the test was not taken
change
- Indicates whether there was a change in the patient's diabetic medications
diabetesMed
- Indicates whether any diabetic medication was prescribed to the patient
readmitted
- Indicates whether an inpatient was readmitted in less than 30 days, in more than 30 days, or not at all

From the IDs_mapping.csv file, we can map the descriptions of the codes for the following attributes:
admission_type_id
discharge_disposition_id
admission_source_id
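A minimal sketch of how such a mapping can be attached to the data; the mapping dict here is a small inlined subset shown for illustration, rather than being read from IDs_mapping.csv:

```python
import pandas as pd

# Illustrative subset of the admission_type_id descriptions
# (inlined here instead of loading IDs_mapping.csv)
admission_type_map = {1: 'Emergency', 2: 'Urgent', 3: 'Elective'}

# Toy frame standing in for the real dataset
sample = pd.DataFrame({'admission_type_id': [1, 3, 2, 1]})
sample['admission_type_desc'] = sample['admission_type_id'].map(admission_type_map)
```

The same `Series.map` (or a `DataFrame.merge` against the loaded mapping file) works for `discharge_disposition_id` and `admission_source_id` as well.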
We start by importing the libraries and packages that will be used for analysis, modeling, and preprocessing throughout the project.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import time
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score,\
precision_score, recall_score,f1_score,make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings('ignore')
The training and test datasets are loaded into two dataframes, df_train and df_test.
For ease of cleaning, the two datasets are merged so that preprocessing can be performed once on the combined data; after preprocessing, the datasets are split back into their original form.
The split is recovered using the encounter_id of each data sample. Since each sample has a unique encounter id, the ids are stored in two arrays to keep track of which samples were in the training set and which were in the test set.
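The split-back step can be sketched with boolean masks over the stored id arrays; the toy frame `df_all` and id lists below are stand-ins for the real data:

```python
import pandas as pd

# Toy stand-in for the merged frame and the stored id arrays
df_all = pd.DataFrame({'encounter_id': [10, 11, 12, 13],
                       'x': [1, 2, 3, 4]})
train_ids, test_ids = [10, 12], [11, 13]

# Recover the original train/test partition by membership in the id arrays
df_train_back = df_all[df_all['encounter_id'].isin(train_ids)]
df_test_back = df_all[df_all['encounter_id'].isin(test_ids)]
```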
# Loading the training dataset
df_train = pd.read_csv('diabetic_data_train.csv')
# Loading the test dataset
df_test = pd.read_csv('diabetic_data_test.csv')
print("Unique encounter ids in test set: ",len(df_test['encounter_id'].unique()))
print("Unique encounter ids in training set: ",len(df_train['encounter_id'].unique()))
# Storing the encounter ids for both the datasets in two separate arrays
test_ids = df_test['encounter_id'].unique()
train_ids = df_train['encounter_id'].unique()
# Merging both the datasets for preprocessing into a single dataframe
df = pd.concat([df_train,df_test], axis = 0)
df.head()
Unique encounter ids in test set: 25442 Unique encounter ids in training set: 76324
Unnamed: 0 | encounter_id | patient_nbr | race | gender | age | weight | admission_type_id | discharge_disposition_id | admission_source_id | ... | citoglipton | insulin | glyburide-metformin | glipizide-metformin | glimepiride-pioglitazone | metformin-rosiglitazone | metformin-pioglitazone | change | diabetesMed | readmitted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 93634 | 338369606 | 159605546 | AfricanAmerican | Male | [70-80) | ? | 1 | 23 | 7 | ... | No | No | No | No | No | No | No | No | No | NO |
1 | 27698 | 90739116 | 83232054 | Caucasian | Female | [70-80) | [75-100) | 3 | 1 | 1 | ... | No | No | No | No | No | No | No | No | No | NO |
2 | 17488 | 64286964 | 95117913 | Caucasian | Male | [60-70) | ? | 3 | 1 | 1 | ... | No | No | No | No | No | No | No | Ch | Yes | >30 |
3 | 96510 | 376610012 | 136697972 | AfricanAmerican | Male | [40-50) | ? | 3 | 1 | 1 | ... | No | No | No | No | No | No | No | No | No | NO |
4 | 35774 | 110557278 | 2851308 | Caucasian | Male | [80-90) | ? | 1 | 1 | 6 | ... | No | No | No | No | No | No | No | No | No | NO |
5 rows × 51 columns
The training dataset has 76324 data samples and the test set has 25442.
Together, the whole dataset has 101766 data samples with 51 features.
print("Total number of columns in training dataset --> ",len(df_train.columns))
print("Total number of rows in training dataset -->",len(df_train))
print("\nTotal number of columns in test dataset --> ",len(df_test.columns))
print("Total number of rows in test dataset -->",len(df_test))
Total number of columns in training dataset --> 51 Total number of rows in training dataset --> 76324 Total number of columns in test dataset --> 51 Total number of rows in test dataset --> 25442
We check the unique values for each of the attributes in order to find any unknown/missing data.
for i in df.columns:
    print(i,"--> ",len(df[i].unique()),"unique values\n",df[i].unique(),"\n")
Unnamed: 0 --> 101766 unique values [93634 27698 17488 ... 46177 11421 52819] encounter_id --> 101766 unique values [338369606 90739116 64286964 ... 142561326 47282550 155779584] patient_nbr --> 71518 unique values [159605546 83232054 95117913 ... 17214336 30036699 8819190] race --> 6 unique values ['AfricanAmerican' 'Caucasian' 'Hispanic' 'Other' '?' 'Asian'] gender --> 3 unique values ['Male' 'Female' 'Unknown/Invalid'] age --> 10 unique values ['[70-80)' '[60-70)' '[40-50)' '[80-90)' '[90-100)' '[10-20)' '[50-60)' '[20-30)' '[30-40)' '[0-10)'] weight --> 10 unique values ['?' '[75-100)' '[100-125)' '[50-75)' '[125-150)' '[175-200)' '[150-175)' '[25-50)' '[0-25)' '>200'] admission_type_id --> 8 unique values [1 3 6 2 5 8 4 7] discharge_disposition_id --> 26 unique values [23 1 6 18 2 3 5 7 11 25 4 22 15 14 13 8 12 19 28 16 24 10 17 27 9 20] admission_source_id --> 17 unique values [ 7 1 6 4 17 5 2 22 3 20 14 9 10 11 8 13 25] time_in_hospital --> 14 unique values [ 3 1 4 2 10 5 14 8 12 6 7 11 9 13] payer_code --> 18 unique values ['MC' '?' 'BC' 'HM' 'UN' 'CM' 'SP' 'OG' 'MD' 'CP' 'OT' 'CH' 'PO' 'MP' 'DM' 'WC' 'SI' 'FR'] medical_specialty --> 73 unique values ['?' 
'Cardiology' 'Family/GeneralPractice' 'Surgery-Vascular' 'Surgery-Cardiovascular/Thoracic' 'Pediatrics-Pulmonology' 'SurgicalSpecialty' 'Emergency/Trauma' 'Orthopedics-Reconstructive' 'InternalMedicine' 'Hospitalist' 'Otolaryngology' 'Gastroenterology' 'Nephrology' 'Surgery-General' 'Surgery-Thoracic' 'ObstetricsandGynecology' 'Pediatrics' 'PhysicalMedicineandRehabilitation' 'Pulmonology' 'Psychology' 'Hematology/Oncology' 'Endocrinology' 'Radiologist' 'Psychiatry' 'Orthopedics' 'Surgery-Neuro' 'Oncology' 'Urology' 'Neurology' 'Pediatrics-CriticalCare' 'Surgeon' 'Ophthalmology' 'Surgery-Cardiovascular' 'Pediatrics-Endocrinology' 'Osteopath' 'Radiology' 'Pathology' 'Podiatry' 'Gynecology' 'Hematology' 'Surgery-Plastic' 'Rheumatology' 'Obstetrics' 'InfectiousDiseases' 'Pediatrics-EmergencyMedicine' 'Endocrinology-Metabolism' 'Pediatrics-AllergyandImmunology' 'Surgery-Pediatric' 'Obsterics&Gynecology-GynecologicOnco' 'Surgery-Colon&Rectal' 'Pediatrics-Hematology-Oncology' 'Resident' 'Surgery-Maxillofacial' 'OutreachServices' 'PhysicianNotFound' 'Cardiology-Pediatric' 'Anesthesiology-Pediatric' 'Anesthesiology' 'DCPTEAM' 'Dentistry' 'Psychiatry-Child/Adolescent' 'Speech' 'Pediatrics-Neurology' 'Proctology' 'Dermatology' 'AllergyandImmunology' 'Neurophysiology' 'SportsMedicine' 'Perinatology' 'Surgery-PlasticwithinHeadandNeck' 'Psychiatry-Addictive' 'Pediatrics-InfectiousDiseases'] num_lab_procedures --> 118 unique values [ 63 54 51 16 64 39 27 65 60 53 43 22 74 40 45 55 31 47 9 38 19 92 41 57 30 46 49 3 56 35 44 84 80 86 20 61 15 23 25 77 72 67 1 58 34 68 24 52 28 50 33 12 6 17 66 42 73 59 48 76 26 36 29 8 70 5 85 10 32 87 78 69 37 18 11 79 21 4 2 14 62 71 13 75 83 7 97 82 93 96 88 94 89 81 95 91 90 104 98 99 100 113 103 101 102 107 106 111 109 108 121 114 105 118 126 120 132 129] num_procedures --> 7 unique values [0 6 1 3 5 4 2] num_medications --> 75 unique values [ 7 8 19 3 14 13 15 31 12 42 11 39 17 10 6 27 23 18 16 20 4 21 30 38 5 28 33 2 26 32 9 22 29 24 25 35 
34 37 36 1 47 55 41 46 44 43 51 68 45 59 64 40 62 49 52 53 60 50 57 69 48 56 54 67 61 81 58 65 63 74 66 70 72 75 79] number_outpatient --> 39 unique values [ 0 7 3 2 1 4 6 5 24 9 15 11 8 14 12 10 22 17 21 28 13 18 16 27 37 29 19 33 35 34 39 23 20 25 42 40 26 36 38] number_emergency --> 33 unique values [ 0 1 2 3 7 4 5 6 10 13 37 11 16 8 9 21 14 12 64 15 18 22 42 19 20 24 46 54 76 29 63 28 25] number_inpatient --> 21 unique values [ 0 2 1 4 6 3 7 5 10 8 16 9 11 12 15 21 13 18 14 19 17] diag_1 --> 717 unique values ['38' '414' '513' '558' '789' '427' '996' '486' '428' '443' '296' '453' '724' '491' '8' '424' '518' '823' '786' '198' '716' '682' '780' 'V57' '188' '41' '435' '153' '250.13' '201' '530' '154' '781' '482' '380' '562' '714' '599' '278' '514' '410' '537' '403' '584' '250.03' '434' '995' '998' '600' '719' '276' '250.8' '535' '813' '648' '707' '282' '557' '493' '250.12' '285' '787' '577' '404' '466' '184' '531' '821' '211' '820' '501' '440' '250.6' '569' '250.83' '437' '572' '935' '250.02' '250.7' '715' '331' '294' '560' '536' '402' '481' '997' '681' '430' '593' '696' '280' '596' '578' '671' '202' '574' '411' '458' '595' '571' '185' '788' '291' '784' '433' '733' '401' '552' '233' '310' '345' '277' '496' '576' 'V58' '355' '585' '250.81' '808' '822' '654' '415' '532' '515' '250.2' '203' '490' '831' '335' '607' '295' '250' '659' '398' '592' '250.4' '288' '346' '423' '438' '722' '250.82' '436' '790' '250.31' '824' '541' '431' '413' '432' '880' '644' '507' '?' 
'812' '426' '785' '250.22' '492' '189' '920' '250.11' '197' '710' '293' '799' '378' '783' '237' '487' '590' '850' '456' '442' 'V54' '146' '959' '726' '661' '457' '340' '250.42' '359' '307' '620' '540' '425' 'V55' '575' '348' '292' '730' '199' '805' '250.92' '162' '218' '245' '226' '965' '250.1' '619' '550' '567' '807' '728' '665' '455' '446' '852' '171' '152' '632' '519' '738' '967' '298' '464' '386' '447' '281' '729' '604' '579' '656' '42' '205' '216' '174' '642' '847' '311' '641' '225' '581' '360' '625' '825' '735' '618' '851' '934' '196' '924' '9' '441' '511' '157' '238' '465' '923' '112' '155' '242' '223' '204' '840' '473' '274' '802' '962' '553' '660' '250.21' '608' '999' '860' '983' '250.41' '150' '459' '906' '782' '394' '309' '626' '416' '420' '534' '444' '290' '429' '556' '721' '354' '711' '250.5' '250.32' '727' '117' '263' '891' '151' '921' '88' '564' '792' '555' '250.01' '70' '478' '512' '605' '383' '305' '972' '303' '271' '396' '54' '191' '220' '53' '854' '485' '182' '969' '79' '989' '528' '986' 'V45' '566' '253' '323' 'V53' '522' '332' '252' '283' '945' '864' '736' '922' '239' '368' '658' '583' '246' '573' '284' '241' '516' '135' '494' '250.3' '627' '952' '39' '336' '300' '614' '421' '588' '881' '287' '454' '680' '462' '882' '723' '836' '806' '451' '958' '286' '333' '848' '227' '183' '275' '793' '3' '244' '510' '136' '594' '611' '395' '35' '273' '871' '756' '47' '992' '694' '384' '49' '669' '617' '621' '324' '304' '591' '521' '193' '334' '586' '601' '526' '652' '338' '215' '936' '751' '251' '5' '963' '693' '163' '705' '362' '527' '161' '235' '320' '718' '149' '312' '987' '663' '803' '57' '342' '195' '944' '893' '568' '250.93' '615' '357' '250.33' '844' '897' '942' '691' '212' '31' '250.52' '192' '379' '240' '461' '801' '351' '297' '372' '616' '180' '971' '843' '250.43' 'V56' '358' '475' '141' 'V63' '376' '991' '994' '480' '94' '633' '853' '837' '933' '565' '603' '370' '250.23' '533' '964' '78' '708' '158' '725' '349' '745' '865' '179' '692' '737' '353' 
'V60' '452' '474' '867' '655' '873' '862' '835' '250.53' '892' '551' '646' '794' '866' '914' '255' '647' '508' '875' '172' '870' '816' '791' '250.9' '382' '34' '214' '645' '266' 'V51' '219' '845' '110' '236' '706' '826' '463' '230' '470' '815' '272' '148' '361' '543' '827' '397' '11' '327' '143' '753' '391' '306' '968' '262' '695' '356' 'V67' '977' '747' '405' '208' '717' '966' '445' '797' '928' '422' '200' '365' '664' '250.91' '640' '890' '622' '653' '759' '623' 'V25' '483' '879' '344' '861' '156' '916' '610' '289' '363' '698' '142' '709' '814' '506' '173' '970' '686' '61' '337' '325' '131' '500' '147' '374' '495' 'V66' '350' '810' '448' '10' '674' 'E909' '27' 'V71' '170' '833' '598' '804' '82' '75' '301' '58' '731' '795' '164' '746' '883' '84' '175' '66' '796' '637' '388' '649' '570' '703' '732' '904' '417' '477' '210' '228' '863' '352' '720' '366' '800' '834' '529' '886' '373' '685' '917' '913' '341' '377' '133' '582' '868' '700' '684' '412' '261' '990' '911' '895' '602' '279' '634' '878' '524' '48' '973' '988' '471' '846' '980' '7' '115' '974' '976' '683' '318' '347' '389' '194' '381' '207' '643' 'V70' '939' '842' '23' '580' '36' '98' '322' '523' '308' '187' '817' '941' '839' '385' '915' '832' '375' 'V07' '299' '114' 'V26' '657' '145' '955' '369' '982' '919' '52' '160' '838' '734' '217' '542' '957' '704' '975' '250.51' '690' '903' '885' '229' '314' 'V43' '97'] diag_2 --> 749 unique values ['294' '411' '250' '724' '728' '585' '998' '250.01' '403' '277' '496' '427' '707' '250.13' '780' 'E818' '486' '250.92' '906' '414' '574' '428' '204' '296' '38' '581' '584' '681' '284' '599' '493' '250.02' '292' 'E942' '276' '518' '788' '567' '425' '459' '530' '710' '401' '70' '782' '711' '682' '438' '433' '162' '?' 
'426' '250.6' '424' '293' '272' '280' '285' '348' '564' '511' '593' '577' '731' '491' '456' '112' '440' '999' '358' '553' '288' '41' '282' '648' '305' '413' '421' '303' '250.2' '410' '590' '250.03' 'E878' '592' '578' '244' '600' '995' '198' '287' '402' '785' '135' '337' '729' '217' '189' '997' '694' '996' '250.91' '924' '790' '575' '786' '435' '250.4' '203' '733' '458' '453' '250.41' '799' 'E947' '576' '289' '536' '300' '291' '591' '787' '197' '396' '351' '812' '416' '8' '562' '784' '492' '571' '279' '342' '250.8' 'E812' '540' 'V43' '338' 'V45' '250.22' '404' '434' '283' '373' '250.82' '153' '312' '465' '340' '820' 'V85' 'E885' '569' '397' '151' '718' '726' '174' '263' '327' '722' '250.1' '521' '278' 'V10' '568' '295' '344' 'E935' '304' 'E888' '873' 'V15' '719' '446' '515' '415' '482' '684' '451' '442' '789' '557' '481' '443' '250.51' 'V42' '196' '642' 'V09' 'E849' 'V49' '625' '286' '560' '603' '494' '447' '730' 'V62' '805' '836' '233' '202' '783' '331' '475' '664' '507' '42' '715' '444' '705' '250.12' '618' '368' '537' '242' '359' '781' '490' 'V58' '595' '696' '565' '596' '275' '309' '495' '394' '332' '281' '601' '816' '412' '919' '345' '324' '250.11' 'E880' '466' '865' '211' '519' 'E905' 'E950' '250.81' '357' '218' '573' '661' '558' '516' 'E936' '821' '208' '250.43' '362' '138' '721' 'V12' '586' '920' '252' '320' '566' '311' '349' '262' '378' '850' '736' '512' '535' '824' '191' '255' '250.7' 'V46' '680' '713' '796' '155' '614' '261' '250.42' '314' 'E879' '531' '201' '429' '572' 'E933' '583' '150' '727' '430' '420' '205' '794' '611' '807' 'E932' '473' '623' '250.53' '716' '579' '851' '550' '607' '319' 'V64' '959' '922' '709' '714' '183' '619' '621' '9' '517' '200' '141' '823' '620' '792' 'E850' '226' '398' '157' '290' '608' '813' '552' '431' '250.93' '94' '913' '723' '386' '31' '867' '508' '626' '478' '514' '274' 'V65' '659' '356' '185' '758' '487' '251' '322' '532' '852' '845' '847' '802' '610' '455' '382' '225' '54' '831' 'E884' '555' 'E939' '510' '990' '250.83' 
'V72' '199' '250.52' '383' '437' '712' '307' '188' '570' '388' '154' '341' '246' '250.5' 'V17' '923' '423' '693' 'E917' '253' '152' '464' '172' '273' '379' '298' '905' 'E919' '114' '524' '825' '745' '355' '381' '701' 'E941' '598' 'E929' '436' 'V18' '192' '301' '934' '844' '462' '685' '360' '706' '972' '372' '686' '717' 'V54' '840' '88' '759' '738' '533' '137' '250.3' '457' 'V63' '969' '907' '364' '365' '250.9' '958' '34' '646' '674' '323' '380' '910' '454' '645' '737' '79' '461' '389' '604' '808' '527' '422' '214' '800' 'E924' '741' '534' '53' '753' 'E854' '346' '227' 'V11' '441' '927' '882' '448' '377' '506' '260' '523' '238' '145' '644' '588' '432' 'V57' 'V08' '881' 'E816' '66' '842' '241' 'E906' '452' '343' '299' '837' '860' '556' '310' '627' '892' '528' '747' '652' '617' '369' '110' '691' 'E944' '136' '335' '239' '317' '793' '977' '245' '117' '193' '220' '40' '698' '11' '484' '750' '891' '164' '250.21' '258' '250.23' '542' 'E829' '791' 'V66' 'E928' 'V70' '975' '917' '308' '656' '746' '663' 'E930' '594' 'V16' '751' 'E870' '156' '529' 'E927' '801' '240' '306' '994' '602' 'E881' 'E819' '853' 'V44' '692' '822' '318' 'E858' 'E934' '485' '27' '756' 'E826' '866' '879' '173' 'E821' '405' '352' '316' 'E883' '460' '861' '893' '864' '111' '477' '513' '297' '616' '333' '911' '991' '989' '195' 'E931' '974' '862' 'E937' '869' '250.31' '500' '826' '354' '742' '266' '259' '336' '131' '912' '350' '909' '945' '580' '725' '641' '232' '376' '163' '483' '921' '474' '250.32' '944' '268' '480' '543' '868' '472' 'V61' '734' '463' '605' '962' '918' '250.33' 'E968' 'E890' '992' '520' '797' '883' '806' '814' '884' '670' '235' 'E918' '647' 'E887' 'E915' '634' '347' '846' 'V14' '968' '953' '863' '933' 'E814' 'V69' '952' '140' '171' '501' '130' 'E916' '215' 'V23' '702' '302' 'E945' '987' 'E853' '754' '703' 'V86' '522' '755' '78' '870' 'E900' 'V55' 'V02' '965' '7' 'E938' '622' '810' 'E882' '615' '180' '683' 'E980' '470' '811' 'V50' '366' '955' '795' '179' '75' '654' 'V60' '832' 'E813' '916' 
'256' '96' '99' '843' '695' 'E817' '374' '269' '815' '752' '880' '980' '963' '325' '871' 'V03' '649' '270' 'V53' '948' '115' '353' '967' '395' '748' '35' '872' '223' '228' '665' '5' '942' '833' '704' 'E965' '186' '947' 'V25' '46' '658' '908' '52' '894' '271' '182' '915' '212' '123' 'E868' 'V13'] diag_3 --> 790 unique values ['?' '996' '553' '401' '424' '403' '250.51' '428' '414' '250.01' '250' '135' '253' '426' '427' '278' '272' '491' '162' '805' '351' '440' '188' '496' 'V09' '600' '486' 'V58' '531' '285' '584' '518' '244' '781' '305' '413' '585' '250.41' '344' '311' '348' '933' '789' '41' '238' '717' 'V45' '599' '438' '198' '647' '38' 'V43' '276' '435' '535' '402' '780' '578' '202' '416' '785' '286' '536' '560' '730' '263' '412' '295' '707' '287' '250.6' '294' '307' '204' '564' '197' '581' '410' '425' '648' '521' '511' '451' '444' '288' '250.02' 'E932' '340' '250.4' '250.8' '786' '437' '289' '482' 'V27' '443' 'E849' '153' '997' 'V10' '492' '154' '517' '784' 'V15' '335' '493' '447' '995' '610' '591' '296' '530' '250.7' '274' '991' '571' '813' '582' '455' '515' '569' '250.92' '694' '519' '303' '799' 'V49' '357' '473' '682' 'E888' '453' '300' '433' '465' '790' '654' '537' '397' '284' 'V85' '574' '681' 'E885' '292' '782' '698' '458' '998' '583' 'E878' '441' '282' '794' '331' '729' '291' '924' '369' '788' '429' '250.5' '70' '378' '590' '916' 'E942' '715' '301' '719' '659' '250.82' '293' 'E884' '572' '280' '110' '250.42' '304' '577' '565' 'V64' '907' '445' '836' '923' '905' '710' '242' '218' '252' '573' '250.52' '380' '690' '255' '396' '663' '592' '377' '203' '332' '728' 'V54' '714' '200' '605' '459' '616' 'V57' 'V46' '327' '620' '365' '345' '8' '787' '660' '876' '337' 'E928' '558' '722' '250.03' '759' '185' 'V42' '150' '562' '642' 'V44' '411' '281' '820' '356' '494' 'E935' '601' '593' '112' '733' '250.13' '338' '618' '724' '873' '346' '404' 'V66' '705' '275' '333' '595' '423' '575' 'V11' 'V12' '251' '199' '552' '157' '783' '802' '205' '506' 'E819' '240' '870' '250.81' 
'516' '596' '342' 'E880' '958' '507' '466' '745' '368' '355' '298' '500' '812' '79' '490' '910' 'V63' 'E879' '434' '384' '847' '456' '693' '312' '921' '711' '625' '685' '457' '744' '966' '723' '568' '716' '731' '161' '483' '725' '42' 'V17' '174' '241' '959' '810' '266' '570' '741' '386' '309' '533' '358' '969' '250.1' '478' '508' '461' '156' '250.22' '607' '598' 'E906' '389' '850' '692' '211' '614' '532' '417' '334' '196' '709' 'V62' '246' 'E934' '512' '250.2' '514' '557' '550' '824' '840' '182' '646' '319' '53' '882' '189' '297' '621' 'V72' '756' '825' '436' '920' '753' '216' '862' '117' '851' 'V08' 'E930' '792' '656' 'V14' '343' '801' '173' '271' '446' '290' '816' '737' '617' '270' 'E950' 'E939' '382' '306' '201' '627' '696' '250.91' '525' '452' '597' '734' '277' '501' '225' '664' '365.44' 'E931' '746' '586' '250.11' '758' '228' '736' '556' '383' '442' '576' '727' '362' '791' '695' '815' 'E881' '354' 'E947' 'E818' '686' '721' '250.12' 'E883' '796' '250.83' '713' '555' '405' '999' '522' '608' '861' '588' 'E927' '881' 'E933' '807' 'E858' '208' '138' '649' 'E850' '892' '842' '793' '567' '372' '718' 'V53' '891' '735' '644' '477' '250.53' '183' '250.9' '528' '712' '34' 'E900' '579' '283' '652' '394' '420' 'V02' '376' 'E941' '256' '94' '279' '366' '398' 'V60' '349' '879' '179' '704' '235' '945' '534' '462' 'E816' '155' '808' '239' 'E870' '626' 'V65' 'E828' '658' '310' 'E944' '163' '35' '323' '821' '922' '726' '655' '454' '250.43' '834' '831' '9' '852' '273' 'E812' '680' '381' '260' '962' '580' '261' '359' '866' '523' '738' '3' 'V18' '186' '336' '415' '911' '347' '192' '245' 'V22' '151' '268' 'V03' '460' '913' 'E938' 'E929' '317' '987' '909' 'V06' '912' '918' '472' '934' '611' '431' '139' '661' '54' '318' '701' 'E887' '867' '88' '262' '795' 'E980' '989' '470' '78' '510' 'E916' '485' '542' '619' '395' '951' '11' 'E853' '747' '464' 'V61' 'V55' '487' '250.93' '953' 'E956' '250.23' '653' '480' '214' '527' '152' '823' '379' '883' '172' '684' '919' '965' '220' '432' '191' 
'643' '594' '800' '860' '481' '141' 'E936' '14' '540' '623' 'E901' '543' 'V16' '223' 'E905' '845' '132' 'E892' '227' 'E852' '131' 'E917' '374' '315' '751' 'V23' 'E924' '313' '906' 'V13' '180' '421' 'E817' '814' '742' '566' 'E965' '17' 'E864' '115' '233' 'E813' '928' '838' '250.3' '826' 'E920' 'E966' 'E946' '854' '697' '388' '703' '917' '122' 'E815' 'E919' '484' '341' '604' '430' 'E943' '250.21' '875' '250.31' '529' 'E937' '752' 'E922' 'E855' '908' '865' '308' '495' '708' '670' '265' '158' '755' '226' '864' '890' 'V86' '822' '674' '665' '706' '797' '370' '971' '387' '732' '259' '373' '7' '538' 'V25' '353' '915' '111' '170' '314' '893' '944' '164' '750' '193' '299' '872' '877' '448' '972' '47' '880' '602' '258' '702' '863' '5' '754' '935' '720' 'E915' '943' '350' '837' '215' '123' '136' '955' '811' '669' 'E904' '853' '148' '844' '146' 'E876' '230' 'V70' '360' '930' '66' 'E822' '948' '243' '841' '952' '475' '942' '624' '391' '236' '641' '671' '970' '956' 'E955' '217' 'V07' 'E949' '75' 'E894' '967' '385' '871' '175' '27' '848' '171' 'E886' 'E945' 'V01' 'E861' 'E882' 'E826' '884' '195' '49' '992' '57' 'E854' '657' '603' '868' 'E865' '524' 'E987' '361' 'E912' 'E825' '757' '622' '463' '980'] number_diagnoses --> 16 unique values [ 8 6 4 5 9 7 3 2 1 12 16 11 10 13 15 14] max_glu_serum --> 4 unique values ['None' 'Norm' '>300' '>200'] A1Cresult --> 4 unique values ['None' 'Norm' '>8' '>7'] metformin --> 4 unique values ['No' 'Up' 'Steady' 'Down'] repaglinide --> 4 unique values ['No' 'Steady' 'Up' 'Down'] nateglinide --> 4 unique values ['No' 'Steady' 'Up' 'Down'] chlorpropamide --> 4 unique values ['No' 'Steady' 'Down' 'Up'] glimepiride --> 4 unique values ['No' 'Steady' 'Down' 'Up'] acetohexamide --> 2 unique values ['No' 'Steady'] glipizide --> 4 unique values ['No' 'Down' 'Steady' 'Up'] glyburide --> 4 unique values ['No' 'Steady' 'Up' 'Down'] tolbutamide --> 2 unique values ['No' 'Steady'] pioglitazone --> 4 unique values ['No' 'Steady' 'Up' 'Down'] rosiglitazone --> 4 
unique values ['No' 'Steady' 'Up' 'Down'] acarbose --> 4 unique values ['No' 'Steady' 'Up' 'Down'] miglitol --> 4 unique values ['No' 'Steady' 'Down' 'Up'] troglitazone --> 2 unique values ['No' 'Steady'] tolazamide --> 3 unique values ['No' 'Steady' 'Up'] examide --> 1 unique values ['No'] citoglipton --> 1 unique values ['No'] insulin --> 4 unique values ['No' 'Steady' 'Up' 'Down'] glyburide-metformin --> 4 unique values ['No' 'Steady' 'Down' 'Up'] glipizide-metformin --> 2 unique values ['No' 'Steady'] glimepiride-pioglitazone --> 2 unique values ['No' 'Steady'] metformin-rosiglitazone --> 2 unique values ['No' 'Steady'] metformin-pioglitazone --> 2 unique values ['No' 'Steady'] change --> 2 unique values ['No' 'Ch'] diabetesMed --> 2 unique values ['No' 'Yes'] readmitted --> 3 unique values ['NO' '>30' '<30']
Our target variable here is readmitted. Certain features in the dataset are irrelevant for predicting a patient's readmission to the hospital.
The attributes race, gender, weight, payer_code, medical_specialty, diag_1, diag_2, and diag_3 have missing/unknown values.
We can find out the percentage of missing data for each of these attributes.
def missing_val(df):
    print("% of missing values for race: ",(len(df.loc[df['race'] == '?'])/len(df))*100,"%")
    print("% of missing values for gender: ",(len(df.loc[df['gender'] == 'Unknown/Invalid'])/len(df))*100,"%")
    print("% of missing values for weight: ",(len(df.loc[df['weight'] == '?'])/len(df))*100,"%")
    print("% of missing values for payer_code: ",(len(df.loc[df['payer_code'] == '?'])/len(df))*100,"%")
    print("% of missing values for medical_specialty: ",(len(df.loc[df['medical_specialty'] == '?'])/len(df))*100,"%")
    print("% of missing values for diag_1: ",(len(df.loc[df['diag_1'] == '?'])/len(df))*100,"%")
    print("% of missing values for diag_2: ",(len(df.loc[df['diag_2'] == '?'])/len(df))*100,"%")
    print("% of missing values for diag_3: ",(len(df.loc[df['diag_3'] == '?'])/len(df))*100,"%")
    print("% of missing values for citoglipton: ",(len(df.loc[df['citoglipton'] == 'No'])/len(df))*100,"%")
    print("% of missing values for examide: ",(len(df.loc[df['examide'] == 'No'])/len(df))*100,"%")
    print("% of missing values for metformin-pioglitazone: ",(len(df.loc[df['metformin-pioglitazone'] == 'No'])/len(df))*100,"%")
    print("% of missing values for metformin-rosiglitazone: ",(len(df.loc[df['metformin-rosiglitazone'] == 'No'])/len(df))*100,"%")
print("\nMissing values in the training set:\n")
missing_val(df_train)
print("\nMissing values in the test set:\n")
missing_val(df_test)
Missing values in the training set: % of missing values for race: 2.2378282060688646 % % of missing values for gender: 0.003930611603165453 % % of missing values for weight: 96.86992296001257 % % of missing values for payer_code: 39.36900581730517 % % of missing values for medical_specialty: 49.10906136994917 % % of missing values for diag_1: 0.02620407735443635 % % of missing values for diag_2: 0.36161626749122167 % % of missing values for diag_3: 1.4163303810072847 % % of missing values for citoglipton: 100.0 % % of missing values for examide: 100.0 % % of missing values for metformin-pioglitazone: 99.99868979613228 % % of missing values for metformin-rosiglitazone: 99.99737959226455 % Missing values in the test set: % of missing values for race: 2.2207373634148255 % % of missing values for gender: 0.0 % % of missing values for weight: 96.82414904488641 % % of missing values for payer_code: 40.12263186856379 % % of missing values for medical_specialty: 49.001650813615285 % % of missing values for diag_1: 0.003930508607813851 % % of missing values for diag_2: 0.3223017058407358 % % of missing values for diag_3: 1.3442339438723372 % % of missing values for citoglipton: 100.0 % % of missing values for examide: 100.0 % % of missing values for metformin-pioglitazone: 100.0 % % of missing values for metformin-rosiglitazone: 100.0 %
We drop the following columns from our dataset for the reasons stated:
weight
: approximately 97% of the values are missing
medical_specialty
: almost 49% of the values are missing
Unnamed: 0
: uninformative feature
payer_code
: uninformative feature
citoglipton
: has only 1 unique value ('No') across all data samples, hence not an informative feature
examide
: has only 1 unique value ('No') across all data samples, hence not an informative feature

df.drop(['weight','payer_code','Unnamed: 0','medical_specialty','citoglipton','examide'], axis=1, inplace=True)
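Constant columns such as citoglipton and examide can also be detected programmatically rather than by eye; a small sketch on a toy frame (the column names here are hypothetical):

```python
import pandas as pd

# Toy frame with one constant column, mimicking examide/citoglipton
toy = pd.DataFrame({'a': [1, 2, 3],
                    'const_col': ['No', 'No', 'No'],
                    'b': [0.1, 0.2, 0.3]})

# A column with a single unique value carries no information
constant_cols = [c for c in toy.columns if toy[c].nunique() == 1]
toy = toy.drop(columns=constant_cols)
```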
We check the frequency of patient readmissions and map each of the 3 unique possible values to appropriate encoding.
print('Relative frequencies of patient readmissions (Training set):')
print(np.round(df_train['readmitted'].value_counts(normalize=True), decimals=4))
print('\nRelative frequencies of patient readmissions (Test set):')
print(np.round(df_test['readmitted'].value_counts(normalize=True), decimals=4))
fig, axes = plt.subplots(1,2)
sns.set(rc={'figure.figsize':(15,8)})
sns.countplot(df_train['readmitted'],ax=axes[0]).set_title('Distribution of patient readmission \n(Training set)')
sns.countplot(df_test['readmitted'],ax=axes[1]).set_title('Distribution of patient readmission \n(Test set)')
fig.show()
df.replace({'readmitted': {'NO': 0,'<30': 1,'>30': 0}}, inplace=True)
Relative frequencies of patient readmissions (Training set): NO 0.5391 >30 0.3493 <30 0.1116 Name: readmitted, dtype: float64 Relative frequencies of patient readmissions (Test set): NO 0.5391 >30 0.3493 <30 0.1116 Name: readmitted, dtype: float64
We can clearly see that about 54% of the patient encounters in the dataset were never readmitted.
Only about 11% of the encounters resulted in a readmission within 30 days.
This is clearly an imbalanced dataset, which we will be handling ahead.
We create a new column Class for our target variable, in order to define what our two classes are.
All the <30 days readmitted patients form our positive class (1), and all the patients who were not readmitted or were readmitted after 30 days form the negative class (0).
df['Class'] = (df.readmitted == 1).astype('int')
We perform some preprocessing on the dataset in order to get it ready for training our model on.
All the missing values are replaced by NaN for imputation purposes.
df = df.replace('?',np.nan)
df = df.replace('Unknown/Invalid',np.nan)
print(df.isnull().sum())
encounter_id                0
patient_nbr                 0
race                     2273
gender                      3
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                     21
diag_2                    358
diag_3                   1423
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide                  0
insulin                     0
glyburide-metformin         0
glipizide-metformin         0
glimepiride-pioglitazone    0
metformin-rosiglitazone     0
metformin-pioglitazone      0
change                      0
diabetesMed                 0
readmitted                  0
Class                       0
dtype: int64
race has 2273 missing values, which will be replaced with 'Other'.
diag_1, diag_2, and diag_3 have 21, 358, and 1423 missing values respectively, which are imputed using SimpleImputer with the most-frequent strategy.
gender has only 3 missing values, which are also imputed using the most-frequent strategy.
df['race'] = df['race'].replace(np.nan,'Other')
misscol = ['diag_1','diag_2','diag_3','gender']
for col in misscol:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[col] = imputer.fit_transform(df[col].values.reshape(-1,1))[:,0]
The age attribute is a categorical variable with 10 unique categories.
The categories are mapped to integers from 1 to 10 in order to ease the encoding process later on.
ageMap = {'[0-10)' : 1,'[10-20)' : 2,'[20-30)' : 3, '[30-40)' : 4, '[40-50)' : 5, '[50-60)' : 6,'[60-70)' : 7, '[70-80)' : 8,
'[80-90)' : 9,'[90-100)' : 10}
df['age'] = df['age'].apply(lambda x : ageMap[x])
The description for the diag_1, diag_2, and diag_3 attributes is given here.
Using this description, we categorise the types of diagnosis into 9 distinct categories and map them in the three columns.
def diag_category(x):
    # ICD-9 codes beginning with 'V' or 'E' fall outside the numeric ranges below
    if 'V' in str(x) or 'E' in str(x):
        return 'other'
    code = int(float(x))
    if code in range(390, 460) or code == 785:
        return 'circulatory'
    if code in range(460, 520) or code == 786:
        return 'respiratory'
    if code in range(520, 580) or code == 787:
        return 'digestive'
    if code == 250:
        return 'diabetes'
    if code in range(800, 1000):
        return 'injury'
    if code in range(710, 740):
        return 'musculoskeletal'
    if code in range(580, 630) or code == 788:
        return 'genitourinary'
    if code in range(140, 240):
        return 'neoplasms'
    return 'other'

# Apply the same mapping to all three diagnosis columns
for col in ['diag_1', 'diag_2', 'diag_3']:
    df[col] = df[col].apply(diag_category)
Let's explore the data and try to derive any useful insights that may be helpful in our analysis and model building ahead.
fig = plt.figure(figsize=(10,8))
sns.countplot(y= df['age'], hue = df['Class']).set_title('Age of Patient VS. Readmission')
fig = plt.figure(figsize=(8,8))
sns.countplot(df['change'], hue = df['Class']).set_title('Change of Medication VS. Readmission')
fig = plt.figure(figsize=(8,5))
sns.countplot(df['gender'], hue = df['Class']).set_title("Gender of Patient VS. Readmission")
#diabetic medication vs readmission
fig = plt.figure(figsize=(8,8))
sns.countplot(df['diabetesMed'], hue = df['Class']).set_title('Diabetes Medication prescribed VS Readmission')
#max_glue_serum vs target
fig = plt.figure(figsize=(8,5))
sns.countplot(y = df['max_glu_serum'], hue = df['Class']).set_title('Glucose test serum test result VS. Readmission')
#discharge_disposition_id VS. Readmission
fig = plt.figure(figsize=(8,8))
sns.countplot(y= df['discharge_disposition_id'], hue = df['Class']).set_title('discharge_disposition_id VS. Readmission')
After cleaning and preprocessing the data, we encode the categorical attributes & select the features to build our model with.
Numerical features are:
time_in_hospital
num_lab_procedures
num_procedures
num_medications
number_outpatient
number_emergency
number_inpatient
number_diagnoses
Categorical features are:
race
gender
max_glu_serum
A1Cresult
metformin
repaglinide
nateglinide
chlorpropamide
glimepiride
acetohexamide
glipizide
glyburide
tolbutamide
pioglitazone
rosiglitazone
acarbose
miglitol
troglitazone
tolazamide
insulin
glyburide-metformin
glipizide-metformin
glimepiride-pioglitazone
change
diabetesMed
metformin-pioglitazone
metformin-rosiglitazone
Numerical categorical features are:
admission_type_id
discharge_disposition_id
admission_source_id
age
diag_1
diag_2
diag_3
# Numerical features
feat_num = ['time_in_hospital','num_lab_procedures', 'num_procedures', 'num_medications',
'number_outpatient', 'number_emergency', 'number_inpatient','number_diagnoses']
# Categorical features
feat_cat = ['race', 'gender',
'max_glu_serum', 'A1Cresult',
'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
'tolazamide', 'insulin','glyburide-metformin', 'glipizide-metformin',
'glimepiride-pioglitazone','change', 'diabetesMed','metformin-pioglitazone','metformin-rosiglitazone']
# Categorical numerical features
feat_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id','age','diag_1','diag_2', 'diag_3']
For converting the categorical features to numbers, we will use one-hot encoding, in which a new column is generated for each unique value of the original column. The value in the new column is 1 if the sample has that unique value and 0 otherwise.
To generate these one-hot encoded columns, we will use the get_dummies function.
To avoid generating redundant columns, we can use the drop_first option, which drops the first categorical value for each column.
Note that the get_dummies function does not encode numerical columns. To encode the numerically coded categorical features as well, we convert them to strings.
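As a quick, self-contained illustration on a toy frame (the column values here are made up and are not from the project dataset), get_dummies skips numeric columns until they are cast to strings, and drop_first removes one redundant level per column:

```python
import pandas as pd

toy = pd.DataFrame({'race': ['Caucasian', 'Asian', 'Caucasian'],
                    'age': [1, 2, 1]})

# Numeric codes must be cast to strings before get_dummies will encode them
toy['age'] = toy['age'].astype('str')

encoded = pd.get_dummies(toy, drop_first=True)
print(list(encoded.columns))  # ['race_Caucasian', 'age_2']
```

With drop_first=True, the alphabetically first level of each column ('Asian', '1') is dropped, since it is implied whenever all the other dummies are 0.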
df[feat_cat_num] = df[feat_cat_num].astype('str')
df_cat = pd.get_dummies(df[feat_cat + feat_cat_num],drop_first = True)
# Merge the original dataframe and the dataframe with encoded columns
df = pd.concat([df,df_cat], axis = 1)
# Storing the names of all encoded categorical column names in a list
feat_all_cat = list(df_cat.columns)
print('Total number of features:', len(feat_num + feat_all_cat))
print('Numerical Features:',len(feat_num))
print('Categorical Features:',len(feat_all_cat))
Total number of features: 150
Numerical Features: 8
Categorical Features: 142
The features we will be using to build our model are these encoded columns.
We retain the encounter_id column so we can split the dataset back into the original training and test sets with the exact same samples.
features = feat_num + feat_all_cat
df_data = df[features + ['Class'] + ['encounter_id']]
df_data.shape
(101766, 152)
Our cleaned and preprocessed dataset now has 101766 samples with 150 features, the encounter_id identifier, and a target variable.
We split the dataframe into a training and test set based on the encounter ids we saved in their respective arrays earlier.
testset = df_data['encounter_id'].isin(test_ids)
trainset = df_data['encounter_id'].isin(train_ids)
df_test = df_data[testset]
df_train = df_data[trainset]
print("Test set proportion to the whole dataset: ",(len(df_test)/len(df_data)))
Test set proportion to the whole dataset: 0.25000491323231727
Now that we have split the dataset back to its original state, we can drop the encounter_id column from our training and test sets, since it is an uninformative feature and not relevant to building our model.
# Reassign rather than drop inplace, to avoid pandas' SettingWithCopyWarning on these row-filtered frames
df_test = df_test.drop('encounter_id', axis=1)
df_train = df_train.drop('encounter_id', axis=1)
We have 76324 data samples in the training set and 25442 data samples in the test set post preprocessing and cleaning.
print("Training set shape: ",df_train.shape)
print("Test set shape : ",df_test.shape)
Training set shape:  (76324, 151)
Test set shape :  (25442, 151)
Now that we are done with data preprocessing and cleaning, it might look like we are ready to train our model on the training set and analyse its performance. However, if we fit a predictive model to the training data as-is, we may well obtain a model with high accuracy that is still not good. Why?
This is because we have an imbalanced dataset with far more negatives than positives, so a model can score well simply by assigning every sample to the negative class, which defeats the purpose of our predictive model.
In order to handle this imbalance, we need to rebalance the data in some way to give the positive class more weight.
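As an aside, resampling is not the only way to give positives more weight: many scikit-learn estimators accept a class_weight argument that reweights the loss instead of duplicating or discarding rows. A minimal sketch on synthetic data (the dataset here is an illustrative stand-in, not the readmission data):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Synthetic stand-in for an imbalanced problem (~10% positives)
X, y = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=42)

# class_weight='balanced' scales each sample's loss inversely to its class
# frequency, so the minority class counts more without resampling any rows
weighted = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X, y)
plain = LogisticRegression(max_iter=1000).fit(X, y)

# The weighted model labels a larger share of samples as positive
print(plain.predict(X).mean(), weighted.predict(X).mean())
```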
We will be using 2 techniques for dealing with this imbalance:
Let us see the distribution of classes in the training set.
print(df_train.groupby(['Class']).size())
sns.set(rc={'figure.figsize':(5,5)})
sns.countplot(df_train['Class']).set_title('Class distribution (Training set)')
Class
0    67806
1     8518
dtype: int64
As is clearly observed, the positive class accounts for only 8518 samples out of the overall training data.
We split the training data into a train set and a validation set in order to build our model and assess its performance on the validation set. The validation split is sized to match the proportion the test set has to the entire dataset, so that the model is assessed on a comparably sized sample.
We separate the input features of training and test set.
# Separate input features and target for training set
y_train_all = df_train.Class
X_train_all = df_train.drop('Class', axis=1)
# Separate input features and target for test set
y_test = df_test.Class
X_test = df_test.drop('Class', axis=1)
# Split training data into train and validation sets
# test_size is chosen so the validation set has the same number of samples (25442) as the test set
X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, test_size=0.33334206802, random_state=42)
print("Training set shape: ",X_train.shape)
print("Validation set shape: ",X_valid.shape)
print("Test set shape: ",X_test.shape)
Training set shape:  (50882, 150)
Validation set shape:  (25442, 150)
Test set shape:  (25442, 150)
We will be using SMOTE and RandomUnderSampler from the imblearn library to perform both the sampling operations.
print("Class distribution BEFORE oversampling: ", Counter(y_train))
smote = SMOTE(sampling_strategy='minority')
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
# Summarize class distribution
print("Class distribution AFTER oversampling : ", Counter(y_train_over))
Class distribution BEFORE oversampling:  Counter({0: 45138, 1: 5744})
Class distribution AFTER oversampling :  Counter({0: 45138, 1: 45138})
Now we have a balanced number of positive and negative classes which is 45138.
print("Class distribution BEFORE undersampling: ", Counter(y_train))
# define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='majority')
# fit and apply the transform
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)
# summarize class distribution
print("Class distribution AFTER undersampling : ", Counter(y_train_under))
Class distribution BEFORE undersampling:  Counter({0: 45138, 1: 5744})
Class distribution AFTER undersampling :  Counter({0: 5744, 1: 5744})
Now we have a balanced number of positive and negative classes which is 5744.
We will use both the undersampled and the oversampled training data to train and evaluate our models and compare the resulting performance.
Now that our dataset is completely ready to train our model, we train a few machine learning models.
We will be evaluating and optimizing these models and ultimately choosing the best model based on performance on the test set.
We will choose to train 3 types of classifier models and analyze their performance: Logistic Regression, Decision Tree, and Random Forest.
In order to evaluate our models, we utilize the following functions and metrics.
def specificity_calc(y_true, y_pred, thresh):
    # specificity = true negatives / all actual negatives
    spec = sum((y_pred < thresh) & (y_true == 0)) / sum(y_true == 0)
    return spec
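A quick sanity check of this specificity computation on a hand-made toy vector (the helper is repeated here so the snippet runs standalone):

```python
import numpy as np

# Same computation as the notebook's specificity_calc, repeated for a standalone run
def specificity_calc(y_true, y_pred, thresh):
    return sum((y_pred < thresh) & (y_true == 0)) / sum(y_true == 0)

y_true = np.array([0, 0, 0, 0, 1, 1])
y_pred = np.array([0, 0, 1, 0, 1, 0])  # hard 0/1 labels, as passed in evaluation_report

# 3 of the 4 actual negatives fall below the 0.5 threshold
print(specificity_calc(y_true, y_pred, 0.5))  # 0.75
```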
def evaluation_report(y_true, y_pred, y_pred_probs, thresh):
auc = roc_auc_score(y_true, y_pred_probs)
accuracy = accuracy_score(y_true, (y_pred > thresh))
recall = recall_score(y_true, (y_pred > thresh))
precision = precision_score(y_true, (y_pred > thresh))
specificity = specificity_calc(y_true, y_pred, thresh)
print("Performance metrics:")
print('AUC: %.3f'%auc)
print('Accuracy: %.3f'%accuracy)
print('Recall: %.3f'%recall)
print('Precision: %.3f'%precision)
print('Specificity: %.3f'%specificity)
print('\n')
return auc, accuracy, recall, precision, specificity
def ROCcurve(y_true,y_pred_probs):
fpr, tpr, thresh = metrics.roc_curve(y_true, y_pred_probs)
roc_auc = metrics.auc(fpr, tpr)
plt.figure(figsize=(7,7))
plt.style.use('seaborn')
plt.rcParams.update({'font.size':17})
plt.title('Receiver Operating Characteristics Curve')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Now that we have handled the imbalance in our training data, we set our threshold at 0.5 for labelling a predicted sample as positive.
threshold = 0.5
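To make the thresholding concrete, here is a toy illustration (the probabilities are made up) of how predicted probabilities turn into class labels, matching the (y_pred > thresh) comparisons used in evaluation_report:

```python
import numpy as np

threshold = 0.5

# Hypothetical predicted probabilities for five samples
probs = np.array([0.2, 0.45, 0.5, 0.7, 0.9])

# A sample is labelled positive only when its probability strictly exceeds
# the threshold, so the sample at exactly 0.5 stays negative
labels = (probs > threshold).astype(int)
print(labels)  # [0 0 0 1 1]
```

Lowering the threshold labels more samples positive (raising recall at the cost of specificity); raising it does the opposite.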
We will be training our model on both the undersampled and the oversampled training data to evaluate the performance.
lr_under = LogisticRegression()
lr_under.fit(X_train_under, y_train_under)
y_pred_under_valid = lr_under.predict(X_valid)
y_pred_probs_valid = lr_under.predict_proba(X_valid)[:,1]
y_pred_under_train = lr_under.predict(X_train_under)
y_pred_probs_train = lr_under.predict_proba(X_train_under)[:,1]
print('LOGISTIC REGRESSION (with undersampled training data)\n')
targ_label = ['Class 0', 'Class 1']
print("TRAINING SET REPORT:")
print(classification_report(y_train_under, y_pred_under_train, target_names=targ_label))
lr_under_auc_train,lr_under_accuracy_train,lr_under_recall_train,lr_under_precision_train,lr_under_spec_train =\
evaluation_report(y_train_under, y_pred_under_train,y_pred_probs_train, threshold)
print("\nVALIDATION SET REPORT:")
print(classification_report(y_valid, y_pred_under_valid, target_names=targ_label))
lr_under_auc,lr_under_accuracy,lr_under_recall,lr_under_precision,lr_under_spec =\
evaluation_report(y_valid, y_pred_under_valid,y_pred_probs_valid, threshold)
ROCcurve(y_valid,y_pred_probs_valid)
LOGISTIC REGRESSION (with undersampled training data)

TRAINING SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.60      0.68      0.64      5744
     Class 1       0.63      0.56      0.59      5744

    accuracy                           0.62     11488
   macro avg       0.62      0.62      0.62     11488
weighted avg       0.62      0.62      0.62     11488

Performance metrics:
AUC: 0.666
Accuracy: 0.617
Recall: 0.555
Precision: 0.634
Specificity: 0.680

VALIDATION SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.93      0.67      0.78     22668
     Class 1       0.18      0.56      0.27      2774

    accuracy                           0.66     25442
   macro avg       0.55      0.62      0.52     25442
weighted avg       0.84      0.66      0.72     25442

Performance metrics:
AUC: 0.664
Accuracy: 0.663
Recall: 0.565
Precision: 0.175
Specificity: 0.675
lr_over = LogisticRegression()
lr_over.fit(X_train_over, y_train_over)
y_pred_over_valid = lr_over.predict(X_valid)
y_pred_probs_valid = lr_over.predict_proba(X_valid)[:,1]
y_pred_over_train = lr_over.predict(X_train_over)
y_pred_probs_train = lr_over.predict_proba(X_train_over)[:,1]
print('LOGISTIC REGRESSION (with oversampled training data)\n')
targ_label = ['Class 0', 'Class 1']
print("TRAINING SET REPORT:")
print(classification_report(y_train_over, y_pred_over_train, target_names=targ_label))
lr_over_auc_train,lr_over_accuracy_train,lr_over_recall_train,lr_over_precision_train,lr_over_spec_train =\
evaluation_report(y_train_over, y_pred_over_train,y_pred_probs_train, threshold)
ROCcurve(y_train_over,y_pred_probs_train)
print("VALIDATION SET REPORT:")
print(classification_report(y_valid, y_pred_over_valid, target_names=targ_label))
lr_over_auc,lr_over_accuracy,lr_over_recall,lr_over_precision,lr_over_spec =\
evaluation_report(y_valid, y_pred_over_valid,y_pred_probs_valid, threshold)
ROCcurve(y_valid,y_pred_probs_valid)
LOGISTIC REGRESSION (with oversampled training data)

TRAINING SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.88      0.95      0.91     45138
     Class 1       0.95      0.87      0.90     45138

    accuracy                           0.91     90276
   macro avg       0.91      0.91      0.91     90276
weighted avg       0.91      0.91      0.91     90276

Performance metrics:
AUC: 0.940
Accuracy: 0.908
Recall: 0.865
Precision: 0.946
Specificity: 0.951
VALIDATION SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.89      0.95      0.92     22668
     Class 1       0.16      0.08      0.11      2774

    accuracy                           0.86     25442
   macro avg       0.53      0.51      0.51     25442
weighted avg       0.81      0.86      0.83     25442

Performance metrics:
AUC: 0.560
Accuracy: 0.856
Recall: 0.079
Precision: 0.165
Specificity: 0.951
tree_under = DecisionTreeClassifier(max_depth = 5)
tree_under.fit(X_train_under, y_train_under)
y_pred_under_valid = tree_under.predict(X_valid)
y_pred_probs_valid = tree_under.predict_proba(X_valid)[:,1]
y_pred_under_train = tree_under.predict(X_train_under)
y_pred_probs_train = tree_under.predict_proba(X_train_under)[:,1]
print('DECISION TREE CLASSIFIER (with undersampled training data)\n')
targ_label = ['Class 0', 'Class 1']
print("TRAINING SET REPORT:")
print(classification_report(y_train_under, y_pred_under_train, target_names=targ_label))
evaluation_report(y_train_under, y_pred_under_train,y_pred_probs_train, threshold)
print("VALIDATION SET REPORT:")
print(classification_report(y_valid, y_pred_under_valid, target_names=targ_label))
dt_under_auc,dt_under_accuracy,dt_under_recall,dt_under_precision,dt_under_spec =\
evaluation_report(y_valid, y_pred_under_valid,y_pred_probs_valid, threshold)
ROCcurve(y_valid,y_pred_probs_valid)
DECISION TREE CLASSIFIER (with undersampled training data)

TRAINING SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.61      0.60      0.60      5744
     Class 1       0.61      0.62      0.61      5744

    accuracy                           0.61     11488
   macro avg       0.61      0.61      0.61     11488
weighted avg       0.61      0.61      0.61     11488

Performance metrics:
AUC: 0.645
Accuracy: 0.608
Recall: 0.617
Precision: 0.606
Specificity: 0.598

VALIDATION SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.89      0.95      0.92     22668
     Class 1       0.16      0.08      0.11      2774

    accuracy                           0.86     25442
   macro avg       0.53      0.51      0.51     25442
weighted avg       0.81      0.86      0.83     25442

Performance metrics:
AUC: 0.652
Accuracy: 0.606
Recall: 0.643
Precision: 0.165
Specificity: 0.602
tree_over = DecisionTreeClassifier(max_depth = 5)
tree_over.fit(X_train_over, y_train_over)
y_pred_over_valid = tree_over.predict(X_valid)
y_pred_probs_valid = tree_over.predict_proba(X_valid)[:,1]
y_pred_over_train = tree_over.predict(X_train_over)
y_pred_probs_train = tree_over.predict_proba(X_train_over)[:,1]
print('DECISION TREE CLASSIFIER (with oversampled training data)\n')
targ_label = ['Class 0', 'Class 1']
print("TRAINING SET REPORT:")
print(classification_report(y_train_over, y_pred_over_train, target_names=targ_label))
evaluation_report(y_train_over, y_pred_over_train,y_pred_probs_train, threshold)
print("VALIDATION SET REPORT:")
print(classification_report(y_valid, y_pred_over_valid, target_names=targ_label))
dt_over_auc,dt_over_accuracy,dt_over_recall,dt_over_precision,dt_over_spec =\
evaluation_report(y_valid, y_pred_over_valid,y_pred_probs_valid, threshold)
ROCcurve(y_valid,y_pred_probs_valid)
DECISION TREE CLASSIFIER (with oversampled training data)

TRAINING SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.72      0.83      0.77     45138
     Class 1       0.80      0.67      0.73     45138

    accuracy                           0.75     90276
   macro avg       0.76      0.75      0.75     90276
weighted avg       0.76      0.75      0.75     90276

Performance metrics:
AUC: 0.809
Accuracy: 0.752
Recall: 0.671
Precision: 0.801
Specificity: 0.833

VALIDATION SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.90      0.83      0.86     22668
     Class 1       0.13      0.21      0.16      2774

    accuracy                           0.77     25442
   macro avg       0.52      0.52      0.51     25442
weighted avg       0.81      0.77      0.79     25442

Performance metrics:
AUC: 0.527
Accuracy: 0.766
Recall: 0.211
Precision: 0.134
Specificity: 0.833
rf_under=RandomForestClassifier(max_depth = 10)
rf_under.fit(X_train_under, y_train_under)
y_pred_under_train = rf_under.predict(X_train_under)
y_pred_probs_train = rf_under.predict_proba(X_train_under)[:,1]
y_pred_under_valid = rf_under.predict(X_valid)
y_pred_probs_valid = rf_under.predict_proba(X_valid)[:,1]
print('RANDOM FOREST CLASSIFIER (with undersampled training data)\n')
targ_label = ['Class 0', 'Class 1']
print("TRAINING SET REPORT:")
print(classification_report(y_train_under, y_pred_under_train, target_names=targ_label))
evaluation_report(y_train_under, y_pred_under_train,y_pred_probs_train, threshold)
print("VALIDATION SET REPORT:")
print(classification_report(y_valid, y_pred_under_valid, target_names=targ_label))
rf_under_auc,rf_under_accuracy,rf_under_recall,rf_under_precision,rf_under_spec =\
evaluation_report(y_valid, y_pred_under_valid,y_pred_probs_valid, threshold)
ROCcurve(y_valid,y_pred_probs_valid)
RANDOM FOREST CLASSIFIER (with undersampled training data)

TRAINING SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.69      0.78      0.73      5744
     Class 1       0.75      0.64      0.69      5744

    accuracy                           0.71     11488
   macro avg       0.72      0.71      0.71     11488
weighted avg       0.72      0.71      0.71     11488

Performance metrics:
AUC: 0.792
Accuracy: 0.712
Recall: 0.642
Precision: 0.747
Specificity: 0.782

VALIDATION SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.93      0.68      0.79     22668
     Class 1       0.18      0.55      0.27      2774

    accuracy                           0.67     25442
   macro avg       0.55      0.62      0.53     25442
weighted avg       0.84      0.67      0.73     25442

Performance metrics:
AUC: 0.668
Accuracy: 0.669
Recall: 0.552
Precision: 0.176
Specificity: 0.683
rf_over=RandomForestClassifier(max_depth = 10)
rf_over.fit(X_train_over, y_train_over)
y_pred_over_train = rf_over.predict(X_train_over)
y_pred_probs_train = rf_over.predict_proba(X_train_over)[:,1]
y_pred_over_valid = rf_over.predict(X_valid)
y_pred_probs_valid = rf_over.predict_proba(X_valid)[:,1]
print('RANDOM FOREST CLASSIFIER (with oversampled training data)\n')
targ_label = ['Class 0', 'Class 1']
print("TRAINING SET REPORT:")
print(classification_report(y_train_over, y_pred_over_train, target_names=targ_label))
evaluation_report(y_train_over, y_pred_over_train,y_pred_probs_train, threshold)
print("VALIDATION SET REPORT:")
print(classification_report(y_valid, y_pred_over_valid, target_names=targ_label))
rf_over_auc,rf_over_accuracy,rf_over_recall,rf_over_precision,rf_over_spec =\
evaluation_report(y_valid, y_pred_over_valid, y_pred_probs_valid, threshold)
ROCcurve(y_valid,y_pred_probs_valid)
RANDOM FOREST CLASSIFIER (with oversampled training data)

TRAINING SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.86      0.92      0.89     45138
     Class 1       0.92      0.84      0.88     45138

    accuracy                           0.88     90276
   macro avg       0.89      0.88      0.88     90276
weighted avg       0.89      0.88      0.88     90276

Performance metrics:
AUC: 0.943
Accuracy: 0.884
Recall: 0.844
Precision: 0.917
Specificity: 0.923

VALIDATION SET REPORT:
              precision    recall  f1-score   support

     Class 0       0.89      0.92      0.91     22668
     Class 1       0.15      0.11      0.13      2774

    accuracy                           0.83     25442
   macro avg       0.52      0.52      0.52     25442
weighted avg       0.81      0.83      0.82     25442

Performance metrics:
AUC: 0.504
Accuracy: 0.834
Recall: 0.072
Precision: 0.108
Specificity: 0.927
For analyzing our baseline models trained above, we create a dataframe with these models' evaluation results and plot them.
For our evaluation of the best model, we will be leveraging the Area under the ROC curve (AUC) metric.
Why?
AUC is a good performance metric to pick the best model since it captures the essential trade off between the true positive rate and the false positive rate. The AUC indicates a model’s ability to distinguish between positive and negative classes in a dataset.
An AUC of 1 indicates that the model made all predictions perfectly and it is a perfect model.
An area of 0.5 represents a model as good as random. We aim to obtain an AUC > 0.5 in order to accept our model as a decent classifier.
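A tiny worked example of these two extremes, using scikit-learn's roc_auc_score on hand-made scores:

```python
from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]

# Ranking every positive above every negative yields a perfect AUC of 1.0
print(roc_auc_score(y_true, [0.1, 0.4, 0.8, 0.9]))  # 1.0

# Constant scores carry no ranking information, giving the coin-flip AUC of 0.5
print(roc_auc_score(y_true, [0.5, 0.5, 0.5, 0.5]))  # 0.5
```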
df_results = pd.DataFrame({'classifier':['LR','LR','DT','DT','RF','RF'],
'sampling type':['undersampled','oversampled']*3,
'auc':[lr_under_auc,lr_over_auc,dt_under_auc,dt_over_auc,rf_under_auc,rf_over_auc],
'accuracy':[lr_under_accuracy,lr_over_accuracy,dt_under_accuracy,dt_over_accuracy,\
rf_under_accuracy,rf_over_accuracy],
'recall':[lr_under_recall,lr_over_recall,dt_under_recall,dt_over_recall,\
rf_under_recall,rf_over_recall],
'precision':[lr_under_precision,lr_over_precision,dt_under_precision,dt_over_precision,\
rf_under_precision,rf_over_precision],
'specificity':[lr_under_spec,lr_over_spec,dt_under_spec,dt_over_spec,rf_under_spec,rf_over_spec]})
print("BASELINE MODEL PERFORMANCE ON VALIDATION SET")
sns.set(style="darkgrid")
fig, axs = plt.subplots(ncols=2)
sns.set(rc={'figure.figsize':(16,8)})
ax = sns.barplot(x="classifier", y="auc", hue="sampling type", data=df_results,ax=axs[0],palette="dark:salmon")
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('AUC', fontsize = 15)
ax.tick_params(labelsize=15)
sns.set(rc={'figure.figsize':(16,8)})
ax = sns.barplot(x="classifier", y="accuracy", hue="sampling type", data=df_results,ax=axs[1],palette="dark:salmon")
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('Accuracy', fontsize = 15)
ax.tick_params(labelsize=15)
fig, axs = plt.subplots(ncols=2)
sns.set(rc={'figure.figsize':(16,8)})
ax = sns.barplot(x="classifier", y="precision", hue="sampling type", data=df_results,ax=axs[0],palette="dark:salmon")
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('Precision', fontsize = 15)
ax.tick_params(labelsize=15)
sns.set(rc={'figure.figsize':(16,8)})
ax = sns.barplot(x="classifier", y="recall", hue="sampling type", data=df_results,ax=axs[1],palette="dark:salmon")
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('Recall', fontsize = 15)
ax.tick_params(labelsize=15)
plt.show()
BASELINE MODEL PERFORMANCE ON VALIDATION SET
From the barplots shown above, we can make the following inferences:
1. In terms of accuracy, training on oversampled data gives the highest accuracy for all three models: Logistic Regression, Decision Tree, and Random Forest. However, since our dataset is imbalanced, accuracy is not the best measure to evaluate our models: a model can score highly simply by predicting every sample as negative.
2. Models trained on undersampled data generalize much better than those trained on oversampled data. We obtain a validation AUC of 0.668 for the Random Forest classifier trained on undersampled data.
3. Precision is the highest for Logistic Regression with a value of around 0.18 which means only 18% of the samples predicted to be of positive class are actually positive.
4. We are able to obtain a recall of around 0.6 through the Decision Tree classifier. This simply means that of all the samples that were truly of the positive class, around 60% were correctly labeled by the classifier to be of that class.
The next thing we will be performing is hyperparameter tuning for the models trained. Hyperparameter tuning is primarily used to optimize the design decisions made while building our machine learning models. For example, the maximum depth of Random Forest or number of neighbors for KNN classifier. All these hyperparameters can be optimized to improve the model in some way.
We will be optimizing the hyperparameters for Logistic Regression, Decision Tree classifier, and Random Forest classifier.
The optimization for KNN will not be performed, since it is time-consuming to train and its results were not the best compared to the other models.
We will be using a hyperparameter tuning technique called Randomized Search, in which a fixed number of hyperparameter combinations is sampled at random from the provided grid and evaluated.
Since we obtained the best results for precision, recall, and AUC with undersampled data above, we will be using only undersampled data to tune the model.
lr_under.get_params()
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
# Note: n_jobs controls parallelism during fitting, not model behaviour,
# so including it in the grid does not affect predictive performance
n_jobs = range(100,1000,200)
penalty = ['l1', 'l2']
solver = ['liblinear']
C = [100, 10, 1.0, 0.1, 0.01]
random_grid = {'n_jobs':n_jobs,
               'penalty':penalty,
               'solver':solver,
               'C':C}
print(random_grid)
{'n_jobs': range(100, 1000, 200), 'penalty': ['l1', 'l2'], 'solver': ['liblinear'], 'C': [100, 10, 1.0, 0.1, 0.01]}
In order to use our RandomizedSearchCV function, we need an evaluator metric to evaluate the set of hyperparameters. We will use the AUC metric for our evaluation.
# Note: make_scorer(roc_auc_score) scores the hard 0/1 predictions;
# scoring='roc_auc' would rank by predicted probabilities instead
auc_scoring = make_scorer(roc_auc_score)
# Creating the randomized search cross-validation
lr_under_random = RandomizedSearchCV(estimator = lr_under, param_distributions = random_grid,
n_iter = 20, cv = 2, scoring=auc_scoring,
verbose = 1)
# Fitting the Random Search model created to our data
t1 = time.time()
lr_under_random.fit(X_train_under, y_train_under)
t2 = time.time()
print(t2-t1)
Fitting 2 folds for each of 20 candidates, totalling 40 fits
106.93133902549744
lr_under_random.best_params_
{'solver': 'liblinear', 'penalty': 'l1', 'n_jobs': 100, 'C': 1.0}
Let us compare our baseline model to the optimized model and check how the performance varies.
lr_under = LogisticRegression()
lr_under.fit(X_train_under, y_train_under)
y_pred_under = lr_under.predict(X_valid)
y_pred_probs = lr_under.predict_proba(X_valid)[:,1]
print('LOGISTIC REGRESSION model (BASELINE)\n')
targ_label = ['Class 0', 'Class 1']
lr_base_auc,lr_base_accuracy,lr_base_recall,lr_base_precision,lr_base_spec =\
evaluation_report(y_valid, y_pred_under,y_pred_probs, threshold)
LOGISTIC REGRESSION model (BASELINE)

Performance metrics:
AUC: 0.664
Accuracy: 0.663
Recall: 0.565
Precision: 0.175
Specificity: 0.675
y_pred_under_random = lr_under_random.best_estimator_.predict(X_valid)
y_pred_probs_random = lr_under_random.best_estimator_.predict_proba(X_valid)[:,1]
print('LOGISTIC REGRESSION model (OPTIMIZED)\n')
targ_label = ['Class 0', 'Class 1']
lr_opti_auc,lr_opti_accuracy,lr_opti_recall,lr_opti_precision,lr_opti_spec =\
evaluation_report(y_valid, y_pred_under_random,y_pred_probs_random, threshold)
LOGISTIC REGRESSION model (OPTIMIZED) Performance metrics: AUC: 0.671 Accuracy: 0.671 Recall: 0.555 Precision: 0.177 Specificity: 0.685
There are very minute differences in the model performance.
The AUC for baseline model was 0.664 which increased to 0.671 while the precision has increased from 0.175 to 0.177, recall decreased from 0.565 to 0.555, and accuracy from 0.663 to 0.671.
We don't seem to observe any significant differences in performance.
Let us check the next model - Decision Tree Classifier
tree_under.get_params()
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
criterion = ['entropy','gini']
max_depth = [1,3,6,9,10,18,23]
max_features = [2,3,6,9,10,15,19,27,35,40,78,150]
min_samples_split = [2,6,9,10]
random_grid_dt = {'criterion':criterion,
'max_depth':max_depth,
'max_features':max_features,
'min_samples_split':min_samples_split}
print(random_grid_dt)
{'criterion': ['entropy', 'gini'], 'max_depth': [1, 3, 6, 9, 10, 18, 23], 'max_features': [2, 3, 6, 9, 10, 15, 19, 27, 35, 40, 78, 150], 'min_samples_split': [2, 6, 9, 10]}
auc_scoring = make_scorer(roc_auc_score)
# Creating the randomized search cross-validation
tree_under_random = RandomizedSearchCV(estimator = tree_under, param_distributions = random_grid_dt,
n_iter = 20, cv = 2, scoring=auc_scoring,
verbose = 1)
# Fitting the Random Search model created to our data
t1 = time.time()
tree_under_random.fit(X_train_under, y_train_under)
t2 = time.time()
print(t2-t1)
Fitting 2 folds for each of 20 candidates, totalling 40 fits 2.3679769039154053
tree_under_random.best_params_
{'min_samples_split': 2, 'max_features': 150, 'max_depth': 6, 'criterion': 'entropy'}
tree_under = DecisionTreeClassifier(max_depth = 5)
tree_under.fit(X_train_under, y_train_under)
y_pred_under = tree_under.predict(X_valid)
y_pred_probs = tree_under.predict_proba(X_valid)[:,1]
print('DECISION TREE CLASSIFIER (BASELINE)\n')
targ_label = ['Class 0', 'Class 1']
dt_base_auc,dt_base_accuracy,dt_base_recall,dt_base_precision,dt_base_spec =\
evaluation_report(y_valid, y_pred_under,y_pred_probs, threshold)
DECISION TREE CLASSIFIER (BASELINE) Performance metrics: AUC: 0.652 Accuracy: 0.606 Recall: 0.643 Precision: 0.165 Specificity: 0.602
y_pred_under_random = tree_under_random.best_estimator_.predict(X_valid)
y_pred_probs_random = tree_under_random.best_estimator_.predict_proba(X_valid)[:,1]
print('DECISION TREE CLASSIFIER (OPTIMIZED)\n')
targ_label = ['Class 0', 'Class 1']
dt_opti_auc,dt_opti_accuracy,dt_opti_recall,dt_opti_precision,dt_opti_spec =\
evaluation_report(y_valid, y_pred_under_random,y_pred_probs_random, threshold)
DECISION TREE CLASSIFIER (OPTIMIZED) Performance metrics: AUC: 0.648 Accuracy: 0.610 Recall: 0.628 Precision: 0.164 Specificity: 0.608
Decision Tree Classifier does not have any significant changes in performance with hyperparameter tuning.
Let us tune the next model - Random Forest Classifier and check if there is any improvement in the model.
rf_under.get_params()
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
n_estimators = range(200,1000,500)
max_depth = range(5,15,5)
criterion = ['entropy']
random_grid_rf = {'n_estimators':n_estimators,
'max_depth':max_depth,
'criterion':criterion}
print(random_grid_rf)
{'n_estimators': range(200, 1000, 500), 'max_depth': range(5, 15, 5), 'criterion': ['entropy']}
auc_scoring = make_scorer(roc_auc_score)
rf_random = RandomizedSearchCV(estimator = rf_under, param_distributions = random_grid_rf, n_iter = 10, cv = 2,
scoring=auc_scoring,verbose = 1)
# Fitting the random search model to our data
t1 = time.time()
rf_random.fit(X_train_under, y_train_under)
t2 = time.time()
print(t2-t1)
Fitting 2 folds for each of 4 candidates, totalling 8 fits 37.010154247283936
rf_random.best_params_
{'n_estimators': 200, 'max_depth': 10, 'criterion': 'entropy'}
rf_under = RandomForestClassifier(max_depth = 10)
rf_under.fit(X_train_under, y_train_under)
y_pred_under = rf_under.predict(X_valid)
y_pred_probs = rf_under.predict_proba(X_valid)[:,1]
print('RANDOM FOREST CLASSIFIER (BASELINE)\n')
targ_label = ['Class 0', 'Class 1']
rf_base_auc,rf_base_accuracy,rf_base_recall,rf_base_precision,rf_base_spec =\
evaluation_report(y_valid, y_pred_under_random,y_pred_probs_random, threshold)
RANDOM FOREST CLASSIFIER (BASELINE) Performance metrics: AUC: 0.648 Accuracy: 0.610 Recall: 0.628 Precision: 0.164 Specificity: 0.608
y_pred_under_random = rf_random.best_estimator_.predict(X_valid)
y_pred_probs_random = rf_random.best_estimator_.predict_proba(X_valid)[:,1]
print('RANDOM FOREST CLASSIFIER (OPTIMIZED)\n')
targ_label = ['Class 0', 'Class 1']
rf_opti_auc,rf_opti_accuracy,rf_opti_recall,rf_opti_precision,rf_opti_spec =\
evaluation_report(y_valid, y_pred_under_random,y_pred_probs_random, threshold)
RANDOM FOREST CLASSIFIER (OPTIMIZED) Performance metrics: AUC: 0.671 Accuracy: 0.667 Recall: 0.558 Precision: 0.176 Specificity: 0.681
We can visualise and plot the optimized and baseline model performance metrics:
df_results = pd.DataFrame({'classifier':['LR','LR','DT','DT','RF','RF'],
'model_type':['optimized','baseline']*3,
'auc':[lr_opti_auc,lr_base_auc,dt_opti_auc,dt_base_auc,rf_opti_auc,rf_base_auc],
'accuracy':[lr_opti_accuracy,lr_base_accuracy,dt_opti_accuracy,dt_base_accuracy,\
rf_opti_accuracy,rf_base_accuracy],
'recall':[lr_opti_recall,lr_base_recall,dt_opti_recall,dt_base_recall,\
rf_opti_recall,rf_base_recall],
'precision':[lr_opti_precision,lr_base_precision,dt_opti_precision,dt_base_precision,\
rf_opti_precision,rf_base_precision],
'specificity':[lr_opti_spec,lr_base_spec,dt_opti_spec,dt_base_spec,rf_opti_spec,rf_base_spec]})
print("OPTIMIZED MODEL PERFORMANCE ON VALIDATION SET")
sns.set(style="darkgrid")
fig, axs = plt.subplots(ncols=2)
sns.set(rc={'figure.figsize':(16,8)})
ax = sns.barplot(x="classifier", y="auc", hue="model_type", data=df_results,ax=axs[0],palette="dark:salmon")
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('AUC', fontsize = 15)
ax.tick_params(labelsize=15)
sns.set(rc={'figure.figsize':(16,8)})
ax = sns.barplot(x="classifier", y="accuracy", hue="model_type", data=df_results,ax=axs[1],palette="dark:salmon")
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('Accuracy', fontsize = 15)
ax.tick_params(labelsize=15)
fig, axs = plt.subplots(ncols=2)
sns.set(rc={'figure.figsize':(16,8)})
ax = sns.barplot(x="classifier", y="precision", hue="model_type", data=df_results,ax=axs[0],palette="dark:salmon")
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('Precision', fontsize = 15)
ax.tick_params(labelsize=15)
sns.set(rc={'figure.figsize':(16,8)})
ax = sns.barplot(x="classifier", y="recall", hue="model_type", data=df_results,ax=axs[1],palette="dark:salmon")
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('Recall', fontsize = 15)
ax.tick_params(labelsize=15)
plt.show()
OPTIMIZED MODEL PERFORMANCE ON VALIDATION SET
Since both - Logistic Regression model and Random Forest classifier has the best AUC out of all, we choose these 2 models as the best ones.
Let us test our best models on the test set and evaluate the performance.
y_pred_under_random = lr_under_random.best_estimator_.predict(X_test)
y_pred_probs_random = lr_under_random.best_estimator_.predict_proba(X_test)[:,1]
print('LOGISTIC REGRESSION MODEL\n')
lr_opti_auc,lr_opti_accuracy,lr_opti_recall,lr_opti_precision,lr_opti_spec =\
evaluation_report(y_test, y_pred_under_random,y_pred_probs_random, threshold)
LOGISTIC REGRESSION MODEL Performance metrics: AUC: 0.663 Accuracy: 0.670 Recall: 0.541 Precision: 0.178 Specificity: 0.686
y_pred_under_random = rf_random.best_estimator_.predict(X_test)
y_pred_probs_random = rf_random.best_estimator_.predict_proba(X_test)[:,1]
print('RANDOM FOREST CLASSIFIER\n')
rf_opti_auc,rf_opti_accuracy,rf_opti_recall,rf_opti_precision,rf_opti_spec =\
evaluation_report(y_test, y_pred_under_random,y_pred_probs_random, threshold)
RANDOM FOREST CLASSIFIER Performance metrics: AUC: 0.665 Accuracy: 0.671 Recall: 0.549 Precision: 0.180 Specificity: 0.687
This project consisted of making a binary classifier to predict the probability that a patient would be readmitted in hospital within 30 days of discharge. On the held out test dataset, our best model had an AUC of 0.665.
Since the complete dataset was highly imbalanced, the accuracy of the model is slightly compromised in terms of predicting the sample correctly.
Using the Random Forest Classifier model, we are able to catch about 55% of the readmissions from our model that performs much better than randomly selecting patients.