データ分析⑧

# PART1 基礎分析をしよう
# おまじない
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# データの読み込み
# train.csvとtest.csvとsubmit_sample.csvの3つのファイルを読み込みましょう
# それぞれを読み込んだものは変数はtrain, test, sampleに代入しましょう
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("submit_sample.csv", header=None)

# trainの先頭行を見てみましょう
# 各カラムの説明はDeepAnalyticsのデータダウンロードページに記載されています
train.head()
id	age	job	marital	education	default	balance	housing	loan	contact	day	month	duration	campaign	pdays	previous	poutcome	y
0	1	39	blue-collar	married	secondary	no	1756	yes	no	cellular	3	apr	939	1	-1	0	unknown	1
1	2	51	entrepreneur	married	primary	no	1443	no	no	cellular	18	feb	172	10	-1	0	unknown	1
2	3	36	management	single	tertiary	no	436	no	no	cellular	13	apr	567	1	595	2	failure	1
3	4	63	retired	married	secondary	no	474	no	no	cellular	25	jan	423	1	-1	0	unknown	1
4	5	31	management	single	tertiary	no	354	no	no	cellular	30	apr	502	1	9	2	success	1

# trainの行数と列数を見てみましょう
train.shape
(27128, 18)

# trainの基礎統計量を見てみましょう
# train.describe()
id	age	balance	day	duration	campaign	pdays	previous	y
count	27128.000000	27128.000000	27128.000000	27128.000000	27128.000000	27128.000000	27128.000000	27128.000000	27128.000000
mean	13564.500000	40.951010	1355.800870	15.806215	260.711295	2.751769	40.528052	0.579733	0.117001
std	7831.323388	10.608542	3003.305272	8.337904	260.091727	3.126594	100.382462	2.503653	0.321427
min	1.000000	18.000000	-6847.000000	1.000000	0.000000	1.000000	-1.000000	0.000000	0.000000
25%	6782.750000	33.000000	72.000000	8.000000	104.000000	1.000000	-1.000000	0.000000	0.000000
50%	13564.500000	39.000000	449.000000	16.000000	182.000000	2.000000	-1.000000	0.000000	0.000000
75%	20346.250000	48.000000	1428.000000	21.000000	323.000000	3.000000	-1.000000	0.000000	0.000000
max	27128.000000	95.000000	102127.000000	31.000000	4918.000000	63.000000	871.000000	275.000000	1.000000

# testの基礎統計量も見てみましょう
test.describe()
id	age	balance	day	duration	campaign	pdays	previous
count	18083.000000	18083.000000	18083.000000	18083.000000	18083.000000	18083.000000	18083.000000	18083.000000
mean	9042.000000	40.914008	1371.980092	15.806725	254.340264	2.781950	39.702428	0.581209
std	5220.256794	10.634331	3105.985293	8.299509	253.591981	3.054651	99.747634	1.965265
min	1.000000	18.000000	-8019.000000	1.000000	0.000000	1.000000	-1.000000	0.000000
25%	4521.500000	33.000000	72.000000	8.000000	102.000000	1.000000	-1.000000	0.000000
50%	9042.000000	39.000000	447.000000	16.000000	178.000000	2.000000	-1.000000	0.000000
75%	13562.500000	48.000000	1427.000000	21.000000	314.000000	3.000000	-1.000000	0.000000
max	18083.000000	95.000000	98417.000000	31.000000	3881.000000	50.000000	854.000000	58.000000

# trainとtestに欠損がないか見てみましょう
train.isnull().sum()
id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
test.isnull().sum()
id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64

# trainでyが1の人数はどれくらいか見てみましょう
train["y"].value_counts()
0    23954
1     3174
Name: y, dtype: int64

# maritalとyのクロス集計をしてみましょう
pd.crosstab関数を使います
pd.crosstab(X["A"], X["B"])と書いた場合、Aが縦列、Bが横列となります

# 更にオプションとしてmargins=Trueを書くと、総計値のカラムまで作成されるので便利です
pd.crosstab(train["marital"],train["y"],margins=True)
pd.crosstab(train["marital"],train["y"],margins=True)
y	0	1	All
marital			
divorced	2691	364	3055
married	14744	1667	16411
single	6519	1143	7662
All	23954	3174	27128

pd.crosstab(train["y"],train["marital"],margins=True)
marital	divorced	married	single	All
y				
0	2691	14744	6519	23954
1	364	1667	1143	3174
All	3055	16411	7662	27128

# ageをビニングしてみましょう
# ageは数値データなので、クロス集計をする為にはビニングが必要です
# ビニングとは数値データを例えば、グループ①(0より大きい、10以下)、グループ②(10より大きい、20以下)…のように集約することを言います
# ビニングはpd.cut関数を使います
# オプションには、①ビニングしたいデータ、②どう区切るのか?(例えば[0,10,20])を書きます
# まずtrain["age"]の基本統計量を確認しましょう
# その後、ビニングした結果を変数age_biningに代入しましょう
train["age"].describe()
count    27128.000000
mean        40.951010
std         10.608542
min         18.000000
25%         33.000000
50%         39.000000
75%         48.000000
max         95.000000
Name: age, dtype: float64


# age_bining = pd.cut(train["age"], [0, 20, 30, 40, 50, 60, 100])
# age_biningとyを使ってクロス集計をしてみましょう
age_bining
0         (30, 40]
1         (50, 60]
2         (30, 40]
3        (60, 100]
4         (30, 40]
          ...    
27123     (40, 50]
27124     (30, 40]
27125     (30, 40]
27126     (30, 40]
27127     (20, 30]
Name: age, Length: 27128, dtype: category
Categories (6, interval[int64]): [(0, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60] < (60, 100]]

pd.crosstab(age_bining,train["y"],margins=True)
y	0	1	All
age			
(0, 20]	39	21	60
(20, 30]	3486	655	4141
(30, 40]	9488	1111	10599
(40, 50]	6117	602	6719
(50, 60]	4416	488	4904
(60, 100]	408	297	705
All	23954	3174	27128

# 実習
# 1.testの行数とカラム数を確かめてみよう

.shape
test.shape
(18083, 17)

# 2.trainのデータには数値データが何カラムあり、質的データが何カラムあるか調べてみよう

train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27128 entries, 0 to 27127
Data columns (total 18 columns):
#   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
0   id         27128 non-null  int64 
1   age        27128 non-null  int64 
2   job        27128 non-null  object
3   marital    27128 non-null  object
4   education  27128 non-null  object
5   default    27128 non-null  object
6   balance    27128 non-null  int64 
7   housing    27128 non-null  object
8   loan       27128 non-null  object
9   contact    27128 non-null  object
10  day        27128 non-null  int64 
11  month      27128 non-null  object
12  duration   27128 non-null  int64 
13  campaign   27128 non-null  int64 
14  pdays      27128 non-null  int64 
15  previous   27128 non-null  int64 
16  poutcome   27128 non-null  object
17  y          27128 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 3.7+ MB

# 3.poutcomeとyのクロス集計をし、変数poutに代入しよう

# margins=Trueはつけましょう
pout = pd.crosstab(train["poutcome"],train["y"],margins=True)
out
pout
y	0	1	All
poutcome			
failure	2578	391	2969
other	950	173	1123
success	312	574	886
unknown	20114	2036	22150
All	23954	3174	27128

# 4.trainのpoutcomeの各値毎に1となる割合を計算し、poutに新たなカラムrateを追加しよう

# poutcomeの各値の合計値に対して1となる割を求めることになるので、pout[1] / pout["All"] で求めることができます
pout["rate"] = pout[1] / pout["All"]
pout
pout
y	0	1	All	rate
poutcome				
failure	2578	391	2969	0.131694
other	950	173	1123	0.154052
success	312	574	886	0.647856
unknown	20114	2036	22150	0.091919
All	23954	3174	27128	0.117001

# 5.trainのdurationをビニングして、変数duration_biningを作りましょう

# train["duration"]の基礎統計量を確認しておきましょう
# 区切り方は[-1,100,200,300,400,500,600,700,800,900,1000,5000]としましょう
train["duration"].describe()
count    27128.000000
mean       260.711295
std        260.091727
min          0.000000
25%        104.000000
50%        182.000000
75%        323.000000
max       4918.000000
Name: duration, dtype: float64

duration_bining = pd.cut(train["duration"],  [-1,100,200,300,400,500,600,700,800,900,1000,5000])
duration_bining
0        (900, 1000]
1         (100, 200]
2         (500, 600]
3         (400, 500]
4         (500, 600]
           ...     
27123     (200, 300]
27124     (300, 400]
27125     (400, 500]
27126      (-1, 100]
27127     (100, 200]
Name: duration, Length: 27128, dtype: category
Categories (11, interval[int64]): [(-1, 100] < (100, 200] < (200, 300] < (300, 400] ... (700, 800] < (800, 900] < (900, 1000] < (1000, 5000]]

# 6.duration_biningとyを使って、クロス集計をし、変数duraに代入しよう

# margins=Trueはつけましょう
dura = pd.crosstab(duration_bining,train["y"],margins=True)
dura
dura
y	0	1	All
duration			
(-1, 100]	6379	60	6439
(100, 200]	7909	450	8359
(200, 300]	4286	557	4843
(300, 400]	2258	408	2666
(400, 500]	1231	308	1539
(500, 600]	699	270	969
(600, 700]	441	259	700
(700, 800]	245	189	434
(800, 900]	149	153	302
(900, 1000]	95	123	218
(1000, 5000]	262	397	659
All	23954	3174	27128

# 7.duraの各値毎に1となる割合を計算し、duraに新たなカラムrateを追加しよう

All
dura["rate"] = dura[1] / dura["All"]
dura
dura
y	0	1	All	rate
duration				
(-1, 100]	6379	60	6439	0.009318
(100, 200]	7909	450	8359	0.053834
(200, 300]	4286	557	4843	0.115011
(300, 400]	2258	408	2666	0.153038
(400, 500]	1231	308	1539	0.200130
(500, 600]	699	270	969	0.278638
(600, 700]	441	259	700	0.370000
(700, 800]	245	189	434	0.435484
(800, 900]	149	153	302	0.506623
(900, 1000]	95	123	218	0.564220
(1000, 5000]	262	397	659	0.602428
All	23954	3174	27128	0.117001

この記事が気に入ったらサポートをしてみませんか?