# Data analysis task (1)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

# Load the data set. A raw string keeps the backslashes of the Windows path
# literal; in a normal string '\s' and '\d' are invalid escape sequences.
data = pd.read_csv(r'D:\statistical\data.csv', encoding='gbk')

# Print a summary of the data
data.info()

# Look at the first 5 rows
data.head()

# Count the missing values in each column, sorted in descending order
data.isnull().sum().sort_values(ascending=False)

# Consider dropping the 'student_feature' column, or filling its NA values with 0
data[['student_feature', 'status']].groupby(data['status']).count()

cat_vars = []


def fea_categorical_check(df):
    """Print and collect the names of the object-dtype columns of *df*.

    Every column whose dtype compares equal to ``'object'`` is printed and
    recorded in the module-level ``cat_vars`` list. A name that is already in
    ``cat_vars`` is not appended again, so calling the function repeatedly on
    the same frame does not produce duplicate entries.

    Args:
        df: A pandas DataFrame whose columns are inspected.

    Returns:
        The module-level ``cat_vars`` list (the same list object on every
        call).
    """
    print('描述变量有:\n')
    for col in df.columns:
        if df[col].dtype == 'object':
            print(col)
            # Guard against duplicates when the function is re-run.
            if col not in cat_vars:
                cat_vars.append(col)
    return cat_vars


# Use 30% of the rows as the test set and 70% as the training set, with the
# random seed set to 2018.
# NOTE(review): y_data is not defined in the visible code - presumably the
# label column (e.g. data['status']); confirm before running.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    data, y_data, train_size=0.7, random_state=2018)