【CASE】旧金山犯罪率数据集(CatBoostClassifier)

参考:top 2% based on CatBoostClassifier

导入库与数据

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import catboost
import gensim

# Load the Kaggle SF-crime competition files from local absolute paths.
# The train file additionally contains Category/Descript/Resolution columns.
data_train = pd.read_csv("C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\kaggle\\sf-crime\\train.csv")
data_test = pd.read_csv("C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\kaggle\\sf-crime\\test.csv")

特征处理

def transformTimeDataset(dataset):
    """Derive calendar features from the 'Dates' timestamp column (in place).

    Adds: Date, n_days (days since the earliest record in this dataset),
    Year, DayOfWeek (0=Monday), WeekOfYear (ISO week), Month, Hour.
    Returns the mutated dataset.
    """
    dataset['Dates'] = pd.to_datetime(dataset['Dates'])
    dataset['Date'] = dataset['Dates'].dt.date
    # Days elapsed since the first record; gives the model a linear time axis.
    dataset['n_days'] = (dataset['Date']-dataset['Date'].min()).apply(lambda x:x.days)
    dataset['Year'] = dataset['Dates'].dt.year
    dataset['DayOfWeek'] = dataset['Dates'].dt.dayofweek
    # Series.dt.weekofyear was deprecated and removed in pandas 2.0;
    # isocalendar().week is the documented equivalent (cast back to int64
    # because isocalendar() returns UInt32).
    dataset['WeekOfYear'] = dataset['Dates'].dt.isocalendar().week.astype('int64')
    dataset['Month'] = dataset['Dates'].dt.month
    dataset['Hour'] = dataset['Dates'].dt.hour
    return dataset
# Apply the calendar-feature expansion to both splits.
data_train = transformTimeDataset(data_train)
data_test = transformTimeDataset(data_test)
def transformdGeoDataset(dataset):
    """Derive address/district features; returns a new DataFrame.

    Block: 1 when the address is a block-style address ("... Block of ..."),
    0 for street intersections. PdDistrict is one-hot encoded with the first
    (alphabetical) district dropped as the baseline.
    """
    # fillna(False) keeps missing addresses mapped to 0 — same result as the
    # old map(lambda x: 1 if x == True else 0), without a per-row lambda.
    dataset['Block'] = dataset['Address'].str.contains('block', case=False).fillna(False).astype(int)
    # NOTE(review): get_dummies is run on train and test independently; if the
    # two files ever contain different district sets the columns will misalign.
    dataset = pd.get_dummies(data=dataset, columns=['PdDistrict'], drop_first=True)
    return dataset
# Apply the geographic transform to both splits.
data_train = transformdGeoDataset(data_train)
data_test = transformdGeoDataset(data_test)
# Drop already-consumed columns. Descript and Resolution exist only in the
# training file (and would leak the answer), so the test drop list is shorter.
data_train = data_train.drop(["Descript", "Resolution","Address","Dates","Date"], axis = 1)
data_test = data_test.drop(["Address","Dates","Date"], axis = 1)
# LabelEncoder is already imported at the top of the file; the duplicate
# mid-file import was removed. Encode the target crime category as ints.
le = LabelEncoder()
data_train.Category = le.fit_transform(data_train.Category)

设置特征与目标

# Feature matrix X (everything but the encoded target) and target vector y.
X = data_train.drop("Category",axis=1)
y = data_train['Category']
print(X.head())
     DayOfWeek    X          Y       n_days  Year  WeekOfYear  Month  Hour  Block  PdDistrict_CENTRAL  PdDistrict_INGLESIDE  PdDistrict_MISSION  PdDistrict_NORTHERN  PdDistrict_PARK  PdDistrict_RICHMOND  PdDistrict_SOUTHERN  PdDistrict_TARAVAL  PdDistrict_TENDERLOIN
0          2 -122.425892  37.774599    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0
1          2 -122.425892  37.774599    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0
2          2 -122.424363  37.800414    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0
3          2 -122.426995  37.800873    4510  2015          20      5    23      1                   0                     0                   0                    1                0                    0                    0                   0                      0
4          2 -122.438738  37.771541    4510  2015          20      5    23      1                   0                     0                   0                    0                1                    0                    0                   0                      0

先把处理的数据封装起来输出(节省时间)

# Persist the engineered features so later runs can skip the preprocessing.
# index=False keeps the row index out of the file — otherwise read_csv would
# add a spurious "Unnamed: 0" column when the data is reloaded below.
# (The redundant pd.DataFrame(data_train) wrapper was removed: data_train is
# already a DataFrame.)
data_train.to_csv("C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\Results\\Crimedatatrain.csv", index=False)
# The test split is reloaded below as Crimedatatest.csv, so it must be
# written here too — the original only saved the training set.
data_test.to_csv("C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\Results\\Crimedatatest.csv", index=False)

重新导入

# Reload the preprocessed checkpoints (a later session can start from here).
# NOTE(review): the visible code above only saved the train file; the test
# file must also have been written for this read to succeed.
data_train = pd.read_csv("C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\Results\\Crimedatatrain.csv")
data_test = pd.read_csv("C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\Results\\Crimedatatest.csv")

print(data_train.head())
       Category  DayOfWeek  X          Y       n_days  Year  WeekOfYear  Month  Hour  Block  PdDistrict_CENTRAL  PdDistrict_INGLESIDE  PdDistrict_MISSION  PdDistrict_NORTHERN  PdDistrict_PARK  PdDistrict_RICHMOND  PdDistrict_SOUTHERN  PdDistrict_TARAVAL  PdDistrict_TENDERLOIN
0        37          2 -122.425892  37.774599    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0
1        21          2 -122.425892  37.774599    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0
2        21          2 -122.424363  37.800414    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0
3        16          2 -122.426995  37.800873    4510  2015          20      5    23      1                   0                     0                   0                    1                0                    0                    0                   0                      0
4        16          2 -122.438738  37.771541    4510  2015          20      5    23      1                   0                     0                   0                    0                1                    0                    0                   0                      0

修改X,Y的列名


def XYrename(data):
    """Rename the coordinate columns in place: 'X' -> 'lat', 'Y' -> 'lon'.

    Returns the same (mutated) DataFrame for convenience.
    NOTE(review): in the raw data X is the longitude and Y the latitude, so
    the chosen names are semantically swapped; kept as-is because all the
    downstream feature code refers to 'lat'/'lon'.
    """
    coord_names = {'X': 'lat', 'Y': 'lon'}
    data.rename(columns=coord_names, inplace=True)
    return data
# Rename coordinates on both splits; XYrename mutates in place, so the
# return value does not need to be reassigned.
XYrename(data_train)
XYrename(data_test)
print(data_test.head())

(此处原为 data_test.head() 输出的截图,图片未能显示)

合并X,y,处理地理坐标点

# Stack train and test to inspect the full coordinate range; the printed
# maxima (-120.5 for 'lat', 90.0 for 'lon') reveal out-of-range placeholder
# points that are cleaned up later in Spatialtransfer.
all_data = pd.concat((data_train,data_test),ignore_index=True)

print(all_data['lat'].min(),all_data['lat'].max())
print(all_data['lon'].min(),all_data['lon'].max())
-122.51364209999998 -120.5
37.70787902 90.0

为了改善模型,扩展地理特征

关于PCA降低维度

【机器学习】Sklearn库主成分分析PCA降维的运用实战

# Quick demo: project the (lat, lon) pairs onto their two principal axes.
pca = PCA(n_components=2)
pca.fit(data_train[['lat','lon']])
XYt = pca.transform(data_train[['lat','lon']])
print(XYt)
[[ 0.00345383  0.00340625]
 [ 0.00345383  0.00340625]
 [ 0.02930857  0.00284016]
 ...
 [ 0.00995497 -0.01886836]
 [ 0.01077519 -0.03170572]
 [-0.03175461 -0.02889356]]

将主成分分析和高斯混合聚类运用到地理坐标中扩展维度改善模型表现

高斯混合聚类的一些介绍:
机器学习(17)——GMM算法
高斯混合模型(Gaussian mixture models)

# (imports repeated from the top of the file)
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
# Two PCA components of the coordinates become extra numeric features.
pca = PCA(n_components=2)
pca.fit(data_train[['lat','lon']])
XYt = pca.transform(data_train[['lat','lon']])
data_train['Spatialpca1'] = XYt[:,0]
data_train['Spatialpca2'] = XYt[:,1]
# 150-cluster Gaussian mixture over (lat, lon); the predicted cluster id
# becomes a categorical spatial feature.
clf = GaussianMixture(n_components=150,covariance_type='diag',random_state=0).fit(data_train[['lat','lon']])
data_train['Spatialpcacluster'] = clf.predict(data_train[['lat','lon']])
print(data_train.Spatialpcacluster.head())

结果:

0     85
1     85
2      8
3      8
4    147
Name: Spatialpcacluster, dtype: int64

DEMO(写一个函数,方便train和test数据集的转换)

from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture


def Spatialtransfer(dataset):
    """First draft of the spatial feature expansion (kept for the write-up).

    NOTE(review): this version misspells 'lat' as 'lan' in several places.
    'lan' is a brand-new column that is NaN for all in-range rows, so every
    feature built from it comes out NaN (visible in the output below). The
    corrected version appears further down in the post.
    """
    # Medians of the in-range coordinates, used to replace outlier points.
    lan_median = dataset[dataset['lat']<-120.5]['lat'].median()
    lon_median = dataset[dataset['lon']<90]['lon'].median()
    # BUG: writes the median into a new 'lan' column instead of fixing 'lat'.
    dataset.loc[dataset['lat']>=-120.5,'lan']= lan_median
    dataset.loc[dataset['lon']>=90,'lon']= lon_median
    dataset['lat+lon']=dataset['lat']+dataset['lon']
    dataset['lat-lon']=dataset['lat']-dataset['lon']
    dataset['Spatial30_1'] = dataset['lat']*np.cos(np.pi/6)+dataset['lon']*np.sin(np.pi/6)
    # BUG: 'lan' instead of 'lat' -> NaN column.
    dataset['Spatial30_2'] = dataset['lon']*np.cos(np.pi/6)-dataset['lan']*np.sin(np.pi/6)
    dataset['Spatial60_1'] = dataset['lat']*np.cos(np.pi/3)+dataset['lon']*np.sin(np.pi/3)
    # BUG: 'lan' instead of 'lat' -> NaN column.
    dataset['Spatial60_2'] = dataset['lon']*np.cos(np.pi/3)-dataset['lan']*np.sin(np.pi/3)
    # Squared distances to bounding-box corners; the 'lan' references below
    # are the same typo.
    dataset['Spatial1'] = (dataset['lat'] - dataset['lan'].min()) ** 2 + (dataset['lon'] - dataset['lon'].min()) ** 2
    dataset['Spatial2'] = (dataset['lat'].max() - dataset['lan']) ** 2 + (dataset['lon'] - dataset['lon'].min()) ** 2
    dataset['Spatial3'] = (dataset['lat'] - dataset['lan'].min()) ** 2 + (dataset['lon'].max() - dataset['lon']) ** 2
    dataset['Spatial4'] = (dataset['lat'].max() - dataset['lan']) ** 2 + (dataset['lon'].max() - dataset['lon']) ** 2
    dataset['Spatial5'] = (dataset['lat'] - lan_median) ** 2 + (dataset['lon'] - lon_median) ** 2
    # PCA components of the (cleaned) coordinates.
    pca = PCA(n_components=2)
    pca.fit(dataset[['lat','lon']])
    XYt = pca.transform(dataset[['lat','lon']])
    dataset['Spatialpca1'] = XYt[:,0]
    dataset['Spatialpca2'] = XYt[:,1]
    # Gaussian-mixture cluster id as a categorical spatial feature.
    clf = GaussianMixture(n_components=150,covariance_type='diag',random_state=0).fit(dataset[['lat','lon']])
    dataset['Spatialpcacluster'] = clf.predict(dataset[['lat','lon']])
    return dataset

# Run the (buggy) first draft on the training set; note the NaN columns in
# the output below.
Spatialtransfer(data_train)
print(data_train.head())

成功输出:

Category  DayOfWeek         lat        lon  n_days  Year  WeekOfYear  Month  Hour  Block  PdDistrict_CENTRAL  PdDistrict_INGLESIDE  PdDistrict_MISSION  PdDistrict_NORTHERN  PdDistrict_PARK  PdDistrict_RICHMOND  PdDistrict_SOUTHERN  PdDistrict_TARAVAL  PdDistrict_TENDERLOIN  lan    lat+lon     lat-lon  Spatial30_1  Spatial30_2  Spatial60_1  Spatial60_2  Spatial1  Spatial2  Spatial3  Spatial4  Spatial5  Spatialpca1  Spatialpca2  Spatialpcacluster
0        37          2 -122.425892  37.774599    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0  NaN -84.651293 -160.200490   -87.136633          NaN   -28.499184          NaN  0.004541       NaN  0.002149       NaN  0.000090    -0.001242    -0.008148                 83
1        21          2 -122.425892  37.774599    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0  NaN -84.651293 -160.200490   -87.136633          NaN   -28.499184          NaN  0.004541       NaN  0.002149       NaN  0.000090    -0.001242    -0.008148                 83
2        21          2 -122.424363  37.800414    4510  2015          20      5    23      0                   0                     0                   0                    1                0                    0                    0                   0                      0  NaN -84.623949 -160.224777   -87.122401          NaN   -28.476062          NaN  0.008626       NaN  0.000446       NaN  0.000688     0.006807    -0.032724                101
3        16          2 -122.426995  37.800873    4510  2015          20      5    23      1                   0                     0                   0                    1                0                    0                    0                   0                      0  NaN -84.626123 -160.227868   -87.124452          NaN   -28.476982          NaN  0.008760       NaN  0.000477       NaN  0.000760     0.004378    -0.033838                101
4        16          2 -122.438738  37.771541    4510  2015          20      5    23      1                   0                     0                   0                    0                1                    0                    0                   0                      0  NaN -84.667196 -160.210279   -87.149287          NaN   -28.508255          NaN  0.004551       NaN  0.002844       NaN  0.000513    -0.014443    -0.008461                 88

还有些小错误(没有把‘lan’改成’lat’导致的空值,下午改。函数没问题的)

pycharm的查找与替换:Ctrl + R 替换
PyCharm中批量查找及替换

修改后:

from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture


def Spatialtransfer(dataset, n_components=150, pca=None, gmm=None):
    """Expand (lat, lon) into engineered spatial features (in place).

    NOTE(review): the columns are named 'lat'/'lon' but actually hold the
    longitude/latitude respectively (renamed from the raw X/Y columns).

    Parameters
    ----------
    dataset : DataFrame with 'lat' and 'lon' columns; mutated and returned.
    n_components : number of GaussianMixture clusters (default 150, as before).
    pca, gmm : optionally pass models already fitted on the training set so
        the test set gets *consistent* PCA axes and cluster ids; by default
        new models are fitted on `dataset`, matching the old behaviour.
    """
    # Replace out-of-range placeholder coordinates with the in-range median.
    lat_median = dataset[dataset['lat']<-120.5]['lat'].median()
    lon_median = dataset[dataset['lon']<90]['lon'].median()
    dataset.loc[dataset['lat']>=-120.5,'lat']= lat_median
    dataset.loc[dataset['lon']>=90,'lon']= lon_median
    # Sums/differences and 30/60-degree rotations of the coordinate axes.
    dataset['lat+lon']=dataset['lat']+dataset['lon']
    dataset['lat-lon']=dataset['lat']-dataset['lon']
    dataset['Spatial30_1'] = dataset['lat']*np.cos(np.pi/6)+dataset['lon']*np.sin(np.pi/6)
    dataset['Spatial30_2'] = dataset['lon']*np.cos(np.pi/6)-dataset['lat']*np.sin(np.pi/6)
    dataset['Spatial60_1'] = dataset['lat']*np.cos(np.pi/3)+dataset['lon']*np.sin(np.pi/3)
    dataset['Spatial60_2'] = dataset['lon']*np.cos(np.pi/3)-dataset['lat']*np.sin(np.pi/3)
    # Squared distances to the four bounding-box corners and the median point.
    dataset['Spatial1'] = (dataset['lat'] - dataset['lat'].min()) ** 2 + (dataset['lon'] - dataset['lon'].min()) ** 2
    dataset['Spatial2'] = (dataset['lat'].max() - dataset['lat']) ** 2 + (dataset['lon'] - dataset['lon'].min()) ** 2
    dataset['Spatial3'] = (dataset['lat'] - dataset['lat'].min()) ** 2 + (dataset['lon'].max() - dataset['lon']) ** 2
    dataset['Spatial4'] = (dataset['lat'].max() - dataset['lat']) ** 2 + (dataset['lon'].max() - dataset['lon']) ** 2
    dataset['Spatial5'] = (dataset['lat'] - lat_median) ** 2 + (dataset['lon'] - lon_median) ** 2
    # PCA components of the cleaned coordinates.
    if pca is None:
        pca = PCA(n_components=2)
        pca.fit(dataset[['lat','lon']])
    XYt = pca.transform(dataset[['lat','lon']])
    dataset['Spatialpca1'] = XYt[:,0]
    dataset['Spatialpca2'] = XYt[:,1]
    # Gaussian-mixture cluster id as a categorical spatial feature.
    if gmm is None:
        gmm = GaussianMixture(n_components=n_components,covariance_type='diag',random_state=0).fit(dataset[['lat','lon']])
    dataset['Spatialpcacluster'] = gmm.predict(dataset[['lat','lon']])
    return dataset

# Apply the spatial expansion to BOTH splits — the original only transformed
# the training set, leaving the saved test file without these features.
Spatialtransfer(data_train)
Spatialtransfer(data_test)
print(data_train.head())

封装数据处理的结果并输出

# Save the fully engineered datasets; index=False prevents a spurious
# "Unnamed: 0" column when these files are read back with read_csv.
data_train.to_csv("C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\Results\\Crimedatatrain2.csv", index=False)
data_test.to_csv("C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\Results\\Crimedatatest2.csv", index=False)

查看一下数据类型

RangeIndex: 878049 entries, 0 to 878048
Data columns (total 33 columns):
Category                 878049 non-null int64
DayOfWeek                878049 non-null int64
lat                      878049 non-null float64
lon                      878049 non-null float64
n_days                   878049 non-null int64
Year                     878049 non-null int64
WeekOfYear               878049 non-null int64
Month                    878049 non-null int64
Hour                     878049 non-null int64
Block                    878049 non-null int64
PdDistrict_CENTRAL       878049 non-null int64
PdDistrict_INGLESIDE     878049 non-null int64
PdDistrict_MISSION       878049 non-null int64
PdDistrict_NORTHERN      878049 non-null int64
PdDistrict_PARK          878049 non-null int64
PdDistrict_RICHMOND      878049 non-null int64
PdDistrict_SOUTHERN      878049 non-null int64
PdDistrict_TARAVAL       878049 non-null int64
PdDistrict_TENDERLOIN    878049 non-null int64
lat+lon                  878049 non-null float64
lat-lon                  878049 non-null float64
Spatial30_1              878049 non-null float64
Spatial30_2              878049 non-null float64
Spatial60_1              878049 non-null float64
Spatial60_2              878049 non-null float64
Spatial1                 878049 non-null float64
Spatial2                 878049 non-null float64
Spatial3                 878049 non-null float64
Spatial4                 878049 non-null float64
Spatial5                 878049 non-null float64
Spatialpca1              878049 non-null float64
Spatialpca2              878049 non-null float64
Spatialpcacluster        878049 non-null int64
dtypes: float64(15), int64(18)
memory usage: 221.1 MB

用CatBoostClassfier

X = data_train.drop(['Category'], axis=1)
y = data_train.Category

# Treat every non-float column as categorical for CatBoost.
# np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# float (the same type np.float aliased) is the supported spelling.
categorical_features_indices = np.where(X.dtypes != float)[0]

from sklearn.model_selection import train_test_split
# train_size=0.3 deliberately trains on only 30% of the rows to keep the
# runtime manageable (see the closing note); raise it for a better score.
X_train,X_validation,y_train,y_validation = train_test_split(X,y,random_state=42,train_size=0.3)


from catboost import CatBoostClassifier
from catboost import Pool
from catboost import cv
from sklearn.metrics import accuracy_score

# use_best_model=True requires the eval_set so CatBoost can pick the
# iteration with the best validation accuracy.
model = CatBoostClassifier(eval_metric='Accuracy',use_best_model=True,random_seed=42)
model.fit(X_train,y_train,cat_features=categorical_features_indices,eval_set=(X_validation,y_validation))

算得太慢了就先不算完了。