Transfer Component Analysis (TCA)


TCA is a feature-based transfer learning method. What does it do? Loosely speaking, it is a lot like PCA: PCA takes one large matrix in and gives one small matrix back; TCA takes two large matrices in and gives two small matrices back. More formally, TCA targets the domain adaptation setting in which the source and target domains follow different data distributions: it maps the data of both domains into a high-dimensional reproducing kernel Hilbert space (RKHS), and in that space minimizes the distance between the source and target data while preserving as much of each domain's internal structure as possible. The intuition: if the two distributions are hard to bring close in the current space, find a mapping under which they do become close; then an ordinary classifier can be applied.
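Concretely, TCA measures the distance between the two domains with the Maximum Mean Discrepancy (MMD). With the kernel trick, the squared MMD reduces to a trace, which is exactly what the `K` and `L` matrices constructed in the code below compute:

$$
\mathrm{MMD}^2(X_s, X_t) = \Bigl\| \frac{1}{n_s}\sum_{i=1}^{n_s}\phi(x_i^{s}) - \frac{1}{n_t}\sum_{j=1}^{n_t}\phi(x_j^{t}) \Bigr\|_{\mathcal{H}}^2 = \operatorname{tr}(KL),
$$

where $K$ is the kernel matrix over the pooled data and $L = ee^{\top}$ with $e_i = 1/n_s$ for source samples and $e_i = -1/n_t$ for target samples. TCA (Pan et al., "Domain Adaptation via Transfer Component Analysis") then seeks a projection $W$ that solves

$$
\min_{W}\ \operatorname{tr}(W^{\top} K L K W) + \lambda\,\operatorname{tr}(W^{\top} W) \quad \text{s.t.} \quad W^{\top} K H K W = I,
$$

with $H = I - \tfrac{1}{n}\mathbf{1}\mathbf{1}^{\top}$ the centering matrix; the constraint preserves data variance. The solution is given by the leading eigenvectors of $(KLK + \lambda I)^{-1} K H K$, which is the matrix `M` built later in this post.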

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats  # matplotlib.mlab.normpdf was removed in matplotlib 3.x; scipy.stats.norm.pdf replaces it
import sklearn.metrics
import warnings
warnings.filterwarnings('ignore')
```

This post was implemented on the Alibaba Tianchi Lab platform. The dataset is a simulated one that I generated randomly; it is hosted on Tianchi. Follow the WeChat official account 【一切皆可解读】 and reply with the keyword 【XZ002】 to view or download it.

```python
source = pd.read_csv('datalab/48363/TCA_source.csv', sep = ',').values
target = pd.read_csv('datalab/48363/TCA_target.csv', sep = ',').values

plt.figure()
# c='none' keeps the markers hollow (c='' raises an error on recent matplotlib)
plt.scatter(source[0:100,2], source[0:100,3], c='none', marker='o', alpha=0.7, edgecolors='r')
plt.scatter(source[100:,2], source[100:,3], c='none', marker='*', alpha=0.7, edgecolors='r')
plt.scatter(target[0:100,2], target[0:100,3], c='none', marker='^', alpha=0.7, edgecolors='b')
plt.scatter(target[100:,2], target[100:,3], c='none', marker='s', alpha=0.7, edgecolors='b')
plt.xlabel('x1')
plt.ylabel('x2')
plt.legend(('Pos. Source', 'Neg. Source', 'Pos. Target', 'Neg. Target'))
```
![png](http://img.chsong.live/Blogs/TCA/2.png-o)
```python
Source_X = source[:, 2:4]
Source_Y = source[:,1]
Target_X = target[:, 2:4]
Target_Y = target[:, 1]
print(Source_X.shape, Source_Y.shape, Target_X.shape, Target_Y.shape)
# (200, 2) (200,) (200, 2) (200,)
```
```python
'''PCA (for comparison)'''
# Centering matrix for the covariance computation
ns = nt = 200
n = ns + nt
H = np.eye(n) - (1/n) * np.ones((n,n))
All_X = np.concatenate((Source_X, Target_X), axis = 0)
# All_X = (All_X - All_X.mean(axis=0)) / All_X.std(axis=0)  # optional standardization
XHX = All_X.T.dot(H).dot(All_X)
eig_values, eig_vectors = np.linalg.eig(XHX)
print(eig_vectors)
# np.linalg.eig does not sort eigenvalues, so pick the eigenvector of the largest one
W = eig_vectors[:, np.argmax(eig_values)][:, np.newaxis]
print(W)
# Output:
# [[-0.82451726 -0.56583681]
#  [ 0.56583681 -0.82451726]]
# [[-0.56583681]
#  [-0.82451726]]
```
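For reference, the code above is plain PCA on the pooled (centered) data: the principal direction solves

$$
\max_{w:\ \|w\|=1}\ w^{\top} X^{\top} H X\, w,
$$

i.e. it is the eigenvector of $X^{\top}HX$ with the largest eigenvalue. PCA maximizes variance but is blind to which domain a sample comes from, so nothing forces the two distributions to align after the projection; the next figure illustrates this.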
```python
All_X_PCA = W.T.dot(All_X.T).squeeze()
################## Overall distribution of the two domains
plt.figure()
S_n, S_bins, S_patches = plt.hist(All_X_PCA[0:200], bins=50, alpha = 0.5)
T_n, T_bins, T_patches = plt.hist(All_X_PCA[200: ], bins=50, alpha = 0.5)
plt.close()

plt.figure()
plt.title('Principal Component Analysis')
plt.xticks([])
plt.yticks([])
plt.ylabel('PDF')
plt.xlabel('x')
S_mu, S_sigma = All_X_PCA[0:200].mean(), All_X_PCA[0:200].std()
T_mu, T_sigma = All_X_PCA[200: ].mean(), All_X_PCA[200: ].std()
S_y, T_y = scipy.stats.norm.pdf(S_bins, S_mu, S_sigma), scipy.stats.norm.pdf(T_bins, T_mu, T_sigma)

plt.plot(S_bins, S_y, 'r-', alpha = 0.5)
plt.plot(T_bins, T_y, 'b-', alpha = 0.5)

################## Per-class distributions within each domain
plt.figure()
S_P_n, S_P_bins, S_P_patches = plt.hist(All_X_PCA[0:100], bins=50, alpha = 0.5)
S_N_n, S_N_bins, S_N_patches = plt.hist(All_X_PCA[100:200], bins=50, alpha = 0.5)
T_P_n, T_P_bins, T_P_patches = plt.hist(All_X_PCA[200:300], bins=50, alpha = 0.5)
T_N_n, T_N_bins, T_N_patches = plt.hist(All_X_PCA[300: ], bins=50, alpha = 0.5)
plt.close()

S_P_mu, S_P_sigma = All_X_PCA[0:100].mean(), All_X_PCA[0:100].std()
S_N_mu, S_N_sigma = All_X_PCA[100:200].mean(), All_X_PCA[100:200].std()
T_P_mu, T_P_sigma = All_X_PCA[200:300].mean(), All_X_PCA[200:300].std()
T_N_mu, T_N_sigma = All_X_PCA[300: ].mean(), All_X_PCA[300: ].std()
S_P_y, T_P_y = scipy.stats.norm.pdf(S_P_bins, S_P_mu, S_P_sigma), scipy.stats.norm.pdf(T_P_bins, T_P_mu, T_P_sigma)
S_N_y, T_N_y = scipy.stats.norm.pdf(S_N_bins, S_N_mu, S_N_sigma), scipy.stats.norm.pdf(T_N_bins, T_N_mu, T_N_sigma)

plt.plot(S_P_bins, S_P_y/10, 'r+', alpha = 0.4)
plt.plot(S_N_bins, S_N_y/10, 'rx', alpha = 0.4)
plt.plot(T_P_bins, T_P_y/10, 'b+', alpha = 0.4)
plt.plot(T_N_bins, T_N_y/10, 'bx', alpha = 0.4)
plt.legend(['Source', 'Target', 'Pos. Source', 'Neg. Source', 'Pos. Target', 'Neg. Target'])
plt.show()
```

*(Figure: distributions of the two domains after the PCA projection, overall and per class.)*

```python
# Arrange the data as m×n (features × instances), as the kernel below expects;
# note that reshape(-1, n) would scramble the samples, so transpose instead
All_X = All_X.T
All_X /= np.linalg.norm(All_X, axis=0)  # scale each sample to unit norm
'''
Gaussian Kernel
X: mxn
m: num of features
n: num of instances
'''
def kernel(kernel_type, X, gamma = 1):
    if kernel_type == 'linear':
        K = np.dot(X.T, X)
    elif kernel_type == 'rbf':
        # D[i, j] = ||x_i||^2 + ||x_j||^2 - 2 x_i·x_j : pairwise squared distances
        D = np.sum(X.T**2, axis=1).reshape(n, -1).dot(np.ones((1, n))) + \
            np.ones((n, 1)).dot(np.sum(X.T**2, axis=1).reshape(-1, n)) - 2 * X.T.dot(X)
        K = np.exp(-gamma * D)
    return K

'''Construct Kernel Matrix K of All_X'''
K = kernel('rbf', All_X, gamma = 1)
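# K is (ns+nt)×(ns+nt): kernel similarities over the pooled source+target data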

'''Construct Matrix L'''
e = np.concatenate((np.ones(ns)/ns, -np.ones(nt)/nt)).reshape(n,-1)
L = e.dot(e.T)
L /= np.linalg.norm(L)
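# With e_i = +1/ns for source samples and -1/nt for target samples,
# tr(K·L) = tr(K·e·e^T) equals the squared MMD between the two domains
# in the kernel-induced feature space (see the formula near the top of the post).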

'''Construct the matrix to be eigendecomposed'''
lamda = 1  # trade-off parameter ('lambda' is a reserved word in Python)
M = np.linalg.pinv(K.dot(L).dot(K.T) + lamda * np.eye(n)).dot(K).dot(H).dot(K.T)
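# The transfer components are the leading eigenvectors of
# (K·L·K + lamda·I)^{-1}·K·H·K: the H factor preserves variance, while the
# inverted (K·L·K + lamda·I) factor penalizes the source/target discrepancy.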

eig_values, eig_vectors = np.linalg.eig(M)
# M is not symmetric, so eig may return values with tiny imaginary parts
eig_values, eig_vectors = eig_values.real, eig_vectors.real
ind = np.argsort(eig_values)
A = eig_vectors[:, ind[-1]].reshape(-1, 1)  # eigenvector of the largest eigenvalue
All_X_TCA = A.T.dot(K).squeeze()

plt.figure()
S_n, S_bins, S_patches = plt.hist(All_X_TCA[0:200], bins=50, alpha = 0.5)
T_n, T_bins, T_patches = plt.hist(All_X_TCA[200: ], bins=50, alpha = 0.5)
plt.close()

plt.figure()
plt.title('Transfer Component Analysis')
plt.xticks([])
plt.yticks([])
plt.ylabel('PDF')
plt.xlabel('x')
S_mu, S_sigma = All_X_TCA[0:200].mean(), All_X_TCA[0:200].std()
T_mu, T_sigma = All_X_TCA[200: ].mean(), All_X_TCA[200: ].std()
S_y, T_y = scipy.stats.norm.pdf(S_bins, S_mu, S_sigma), scipy.stats.norm.pdf(T_bins, T_mu, T_sigma)

plt.plot(S_bins, S_y, 'r-', alpha = 0.5)
plt.plot(T_bins, T_y, 'b-', alpha = 0.5)

################## Per-class distributions within each domain
plt.figure()
S_P_n, S_P_bins, S_P_patches = plt.hist(All_X_TCA[0:100], bins=50, alpha = 0.5)
S_N_n, S_N_bins, S_N_patches = plt.hist(All_X_TCA[100:200], bins=50, alpha = 0.5)
T_P_n, T_P_bins, T_P_patches = plt.hist(All_X_TCA[200:300], bins=50, alpha = 0.5)
T_N_n, T_N_bins, T_N_patches = plt.hist(All_X_TCA[300: ], bins=50, alpha = 0.5)
plt.close()

S_P_mu, S_P_sigma = All_X_TCA[0:100].mean(), All_X_TCA[0:100].std()
S_N_mu, S_N_sigma = All_X_TCA[100:200].mean(), All_X_TCA[100:200].std()
T_P_mu, T_P_sigma = All_X_TCA[200:300].mean(), All_X_TCA[200:300].std()
T_N_mu, T_N_sigma = All_X_TCA[300: ].mean(), All_X_TCA[300: ].std()
S_P_y, T_P_y = scipy.stats.norm.pdf(S_P_bins, S_P_mu, S_P_sigma), scipy.stats.norm.pdf(T_P_bins, T_P_mu, T_P_sigma)
S_N_y, T_N_y = scipy.stats.norm.pdf(S_N_bins, S_N_mu, S_N_sigma), scipy.stats.norm.pdf(T_N_bins, T_N_mu, T_N_sigma)

plt.plot(S_P_bins, S_P_y/10, 'r+', alpha = 0.4)
plt.plot(S_N_bins, S_N_y/10, 'rx', alpha = 0.4)
plt.plot(T_P_bins, T_P_y/10, 'b+', alpha = 0.4)
plt.plot(T_N_bins, T_N_y/10, 'bx', alpha = 0.4)
plt.legend(['Source', 'Target', 'Pos. Source', 'Neg. Source', 'Pos. Target', 'Neg. Target'])
plt.show()
```

*(Figure: distributions of the two domains after the TCA projection, overall and per class.)*
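As the introduction notes, once the two domains have been brought close together in the projected space, an ordinary classifier trained on the source can be applied to the target. Here is a minimal sketch of that step, assuming `All_X_TCA`, `Source_Y`, and `Target_Y` from above are in scope; the choice of a 5-nearest-neighbour classifier is my illustration, not part of the original post:

```python
# Hypothetical downstream step, not from the original post:
# train on the TCA-projected source samples, evaluate on the target samples.
from sklearn.neighbors import KNeighborsClassifier

Z = All_X_TCA.reshape(-1, 1)           # the 1-D transfer component as the only feature
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(Z[:200], Source_Y)             # rows 0..199 are the source domain
pred = clf.predict(Z[200:])            # rows 200..399 are the target domain
print('Target accuracy:', sklearn.metrics.accuracy_score(Target_Y, pred))
```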
