Skip to content

Latest commit

 

History

History
127 lines (116 loc) · 6.5 KB

File metadata and controls

127 lines (116 loc) · 6.5 KB

GCDSteps

# Load the Statlog German Credit data set.
# From: http://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)
# The file is read as space-separated with no header row, so the column
# names are attached by hand below. Text columns are kept as character.
d <- read.table(
  file = "german.data",
  header = FALSE,
  sep = " ",
  stringsAsFactors = FALSE
)

# One name per column, in file order; the last column is the outcome.
names(d) <- c(
  "Status_of_existing_checking_account",
  "Duration_in_month",
  "Credit_history",
  "Purpose",
  "Credit_amount",
  "Savings_account_bonds",
  "Present_employment_since",
  "Installment_rate_in_percentage_of_disposable_income",
  "Personal_status_and_sex",
  "Other_debtors_guarantors",
  "Present_residence_since",
  "Property",
  "Age_in_years",
  "Other_installment_plans",
  "Housing",
  "Number_of_existing_credits_at_this_bank",
  "Job",
  "Number_of_people_being_liable_to_provide_maintenance_for",
  "Telephone",
  "foreign_worker",
  "Good_Loan"
)

# Print the structure of the loaded frame as a quick sanity check.
str(d)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Status_of_existing_checking_account                     : chr  "A11" "A12" "A14" "A11" ...
##  $ Duration_in_month                                       : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ Credit_history                                          : chr  "A34" "A32" "A34" "A32" ...
##  $ Purpose                                                 : chr  "A43" "A43" "A46" "A42" ...
##  $ Credit_amount                                           : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ Savings_account_bonds                                   : chr  "A65" "A61" "A61" "A61" ...
##  $ Present_employment_since                                : chr  "A75" "A73" "A74" "A74" ...
##  $ Installment_rate_in_percentage_of_disposable_income     : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ Personal_status_and_sex                                 : chr  "A93" "A92" "A93" "A93" ...
##  $ Other_debtors_guarantors                                : chr  "A101" "A101" "A101" "A103" ...
##  $ Present_residence_since                                 : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ Property                                                : chr  "A121" "A121" "A121" "A122" ...
##  $ Age_in_years                                            : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ Other_installment_plans                                 : chr  "A143" "A143" "A143" "A143" ...
##  $ Housing                                                 : chr  "A152" "A152" "A152" "A153" ...
##  $ Number_of_existing_credits_at_this_bank                 : int  2 1 1 1 2 1 1 1 1 2 ...
##  $ Job                                                     : chr  "A173" "A173" "A172" "A173" ...
##  $ Number_of_people_being_liable_to_provide_maintenance_for: int  1 1 2 2 2 2 1 1 1 1 ...
##  $ Telephone                                               : chr  "A192" "A191" "A191" "A191" ...
##  $ foreign_worker                                          : chr  "A201" "A201" "A201" "A201" ...
##  $ Good_Loan                                               : int  1 2 1 1 2 1 1 1 1 2 ...
# Recode the outcome column as a factor: the value 1 becomes 'GoodLoan',
# any other non-missing value becomes 'BadLoan'.
d[["Good_Loan"]] <- factor(
  ifelse(d[["Good_Loan"]] == 1, "GoodLoan", "BadLoan")
)

# Lookup table from the data set's coded category values ("A11", "A12", ...)
# to human-readable labels. Names are the codes, values are the labels.
# NOTE(review): several labels contain "< =" / "> =" followed by two spaces;
# this looks like "<=" / ">=" mangled by a reformatter, but the text is kept
# verbatim because it becomes the factor levels of the saved data -- confirm
# before changing.
mapping <- c(
  # Status_of_existing_checking_account
  "A11" = "... < 0 DM",
  "A12" = "0 < =  ... < 200 DM",
  "A13" = "... > =  200 DM / salary assignments for at least 1 year",
  "A14" = "no checking account",
  # Credit_history
  "A30" = "no credits taken/all credits paid back duly",
  "A31" = "all credits at this bank paid back duly",
  "A32" = "existing credits paid back duly till now",
  "A33" = "delay in paying off in the past",
  "A34" = "critical account/other credits existing (not at this bank)",
  # Purpose
  "A40" = "car (new)",
  "A41" = "car (used)",
  "A42" = "furniture/equipment",
  "A43" = "radio/television",
  "A44" = "domestic appliances",
  "A45" = "repairs",
  "A46" = "education",
  "A47" = "(vacation - does not exist?)",
  "A48" = "retraining",
  "A49" = "business",
  "A410" = "others",
  # Savings_account_bonds
  "A61" = "... < 100 DM",
  "A62" = "100 < =  ... < 500 DM",
  "A63" = "500 < =  ... < 1000 DM",
  "A64" = ".. > =  1000 DM",
  "A65" = "unknown/ no savings account",
  # Present_employment_since
  "A71" = "unemployed",
  "A72" = "... < 1 year",
  "A73" = "1 < =  ... < 4 years",
  "A74" = "4 < =  ... < 7 years",
  "A75" = ".. > =  7 years",
  # Personal_status_and_sex
  "A91" = "male : divorced/separated",
  "A92" = "female : divorced/separated/married",
  "A93" = "male : single",
  "A94" = "male : married/widowed",
  "A95" = "female : single",
  # Other_debtors_guarantors
  "A101" = "none",
  "A102" = "co-applicant",
  "A103" = "guarantor",
  # Property
  "A121" = "real estate",
  "A122" = "if not A121 : building society savings agreement/life insurance",
  "A123" = "if not A121/A122 : car or other, not in attribute 6",
  "A124" = "unknown / no property",
  # Other_installment_plans
  "A141" = "bank",
  "A142" = "stores",
  "A143" = "none",
  # Housing
  "A151" = "rent",
  "A152" = "own",
  "A153" = "for free",
  # Job
  "A171" = "unemployed/ unskilled - non-resident",
  "A172" = "unskilled - resident",
  "A173" = "skilled employee / official",
  "A174" = "management/ self-employed/highly qualified employee/ officer",
  # Telephone
  "A191" = "none",
  "A192" = "yes, registered under the customers name",
  # foreign_worker
  "A201" = "yes",
  "A202" = "no"
)
# Replace every character column of `d` with a factor of readable labels,
# looking each coded value up by name in `mapping`. Non-character columns
# are left untouched.
for (ci in colnames(d)) {
  if (is.character(d[[ci]])) {
    # Indexing `mapping` by a name it does not contain yields NA, which would
    # otherwise vanish silently into the factor; report such codes instead.
    if (!all(d[[ci]] %in% names(mapping))) {
      warning(
        "column '", ci, "' contains codes missing from `mapping` ",
        "(they become NA): ",
        paste(setdiff(unique(d[[ci]]), names(mapping)), collapse = ", "),
        call. = FALSE
      )
    }
    d[[ci]] <- as.factor(mapping[d[[ci]]])
  }
}



# Keep the prepared frame under the name `creditdata` and write it to disk
# as an RDS file.
creditdata <- d
saveRDS(object = creditdata, file = "creditdata.RDS")

# Names of every column except the outcome column.
vars <- setdiff(names(d), "Good_Loan")

# Not part of the GCD data: notional 2x2 tables used as the example for
# listing 1.3. Rows are the loan-to-disposable-income bucket, columns are
# the loan quality; counts are given in column-major order because
# matrix() fills by column.
tab1 <- as.table(matrix(
  c(50, 6, 0, 44),
  nrow = 2,
  ncol = 2,
  dimnames = list(
    "loan_as_pct_disposable_income" = c("LT.15pct", "GT.15pct"),
    "loan_quality_pop1" = c("goodloan", "badloan")
  )
))
tab2 <- as.table(matrix(
  c(34, 18, 16, 32),
  nrow = 2,
  ncol = 2,
  dimnames = list(
    "loan_as_pct_disposable_income" = c("LT.15pct", "GT.15pct"),
    "loan_quality_pop2" = c("goodloan", "badloan")
  )
))

# Write every object that ls() reports at this point to 'GCDData.RData'.
# Because the list comes from ls(), helper objects created earlier in the
# script (e.g. `mapping`, `vars`, the loop variable `ci`) are saved as well
# as the data itself; adding or renaming any variable above changes the
# contents of this file.
save(list = ls(), file = 'GCDData.RData')