combinedDeepLearningActiveContour/functions/sparseAutoencoderCost.m

function [cost,grad] = sparseAutoencoderCost(theta, visibleSize, hiddenSize, ...
                                             lambda, sparsityParam, beta, data)
% visibleSize: the number of input units (probably 64)
% hiddenSize: the number of hidden units (probably 25)
% lambda: weight decay parameter
% sparsityParam: the desired average activation for the hidden units (denoted in the lecture
%                notes by the Greek letter rho, which looks like a lower-case "p").
% beta: weight of the sparsity penalty term
% data: our 64x10000 matrix containing the training data, so data(:,i) is the i-th training example.
|
|
% The input theta is a vector (because minFunc expects the parameters to be a vector).
% We first convert theta to the (W1, W2, b1, b2) matrix/vector format, so that this
% follows the notation convention of the lecture notes.

W1 = reshape(theta(1:hiddenSize*visibleSize), hiddenSize, visibleSize);
W2 = reshape(theta(hiddenSize*visibleSize+1:2*hiddenSize*visibleSize), visibleSize, hiddenSize);
b1 = theta(2*hiddenSize*visibleSize+1:2*hiddenSize*visibleSize+hiddenSize);
b2 = theta(2*hiddenSize*visibleSize+hiddenSize+1:end);
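
% Layout of theta (and of grad at the end of this function): [W1(:) ; W2(:) ; b1 ; b2],
% i.e. hiddenSize*visibleSize entries for W1, the same number for W2, then
% hiddenSize entries for b1 and visibleSize entries for b2.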
|
|
% Cost and gradient variables (your code needs to compute these values).
% Here, we initialize them to zeros.
cost = 0;
W1grad = zeros(size(W1));
W2grad = zeros(size(W2));
b1grad = zeros(size(b1));
b2grad = zeros(size(b2));
|
|
%% ---------- YOUR CODE HERE --------------------------------------
% Instructions: Compute the cost/optimization objective J_sparse(W,b) for the Sparse Autoencoder,
% and the corresponding gradients W1grad, W2grad, b1grad, b2grad.
%
% W1grad, W2grad, b1grad and b2grad should be computed using backpropagation.
% Note that W1grad has the same dimensions as W1, b1grad has the same dimensions
% as b1, etc.  Your code should set W1grad to be the partial derivative of J_sparse(W,b) with
% respect to W1.  I.e., W1grad(i,j) should be the partial derivative of J_sparse(W,b)
% with respect to the input parameter W1(i,j).  Thus, W1grad should be equal to the term
% [(1/m) \Delta W^{(1)} + \lambda W^{(1)}] in the last block of pseudo-code in Section 2.2
% of the lecture notes (and similarly for W2grad, b1grad, b2grad).
%
% Stated differently, if we were using batch gradient descent to optimize the parameters,
% the gradient descent update to W1 would be W1 := W1 - alpha * W1grad, and similarly for W2, b1, b2.
%
|
|
% input data
x = data;
% for an autoencoder the target output y is equal to the input x
y = x;

% number of training examples
m = size(x,2);

% Vectorized implementation of forward propagation
z2 = W1 * x + repmat(b1,1,m);
a2 = sigmoid(z2);
z3 = W2 * a2 + repmat(b2,1,m);
h = sigmoid(z3);
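
% In the notation of the lecture notes, the forward pass above computes
%   a2 = f(z2) = f(W1*x + b1)   and   h = a3 = f(z3) = f(W2*a2 + b2),
% where f is the sigmoid; repmat simply replicates the bias vectors across
% the m columns (training examples).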
|
|
% squared error term
hmy = h - y;
J_sq = sum(sum(hmy.*hmy))/(2*m);

% weight decay term
J_wd = lambda/2*(trace(W1'*W1)+trace(W2'*W2));

% sparsity penalty
rho = sparsityParam;
rho_hat = mean(a2,2);
KL = rho.*log(rho./rho_hat)+(1-rho).*log((1-rho)./(1-rho_hat));
J_sp = beta*sum(KL);

% cost = J_[squared_error] + J_[weight_decay] + J_[sparsity_penalty]
cost = J_sq + J_wd + J_sp;
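
% The overall objective computed above is
%   J_sparse(W,b) = (1/(2m)) * sum_i ||h(x_i) - x_i||^2
%                 + (lambda/2) * (||W1||_F^2 + ||W2||_F^2)
%                 + beta * sum_j KL(rho || rho_hat_j),
% where KL(rho || rho_hat_j) = rho*log(rho/rho_hat_j) + (1-rho)*log((1-rho)/(1-rho_hat_j)),
% rho_hat_j is the mean activation of hidden unit j over the m examples, and
% trace(W'*W) equals the squared Frobenius norm ||W||_F^2.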
|
|
% gradient
a1 = x;
delta3 = -(y - h) .* fprime(z3);
sparsity_delta = - rho ./ rho_hat + (1 - rho) ./ (1 - rho_hat);
sd_mat = repmat(sparsity_delta,1,m);
delta2 = (W2'*delta3 + beta*sd_mat) .* fprime(z2);
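
% sparsity_delta is the derivative of the KL penalty with respect to rho_hat for
% each hidden unit, so the sparsity term enters backpropagation only through
% delta2; delta3 and delta2 are the output- and hidden-layer error terms from
% the lecture notes.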
|
|
W2grad = delta3*a2'/m + lambda*W2;
W1grad = delta2*a1'/m + lambda*W1;

b2grad = delta3 * ones(m,1)/m;
b1grad = delta2 * ones(m,1)/m;
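
% These are the batch expressions from the notes, W(l)grad = (1/m)*DeltaW(l) + lambda*W(l)
% and b(l)grad = (1/m)*Deltab(l); multiplying by ones(m,1) sums the deltas over
% the m training examples.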
|
|
%-------------------------------------------------------------------
% After computing the cost and gradient, we will convert the gradients back
% to a vector format (suitable for minFunc).  Specifically, we will unroll
% your gradient matrices into a vector.

grad = [W1grad(:) ; W2grad(:) ; b1grad(:) ; b2grad(:)];

end
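
%-------------------------------------------------------------------
% A quick sanity check of the analytic gradient against centered finite
% differences can be run before training. This is only a sketch (not part of
% the original interface) and uses nothing beyond this function itself:
%
%   [~, grad] = sparseAutoencoderCost(theta, visibleSize, hiddenSize, ...
%                                     lambda, sparsityParam, beta, data);
%   epsilon = 1e-4;
%   numgrad = zeros(size(theta));
%   for i = 1:numel(theta)
%       e = zeros(size(theta));  e(i) = epsilon;
%       Jp = sparseAutoencoderCost(theta + e, visibleSize, hiddenSize, ...
%                                  lambda, sparsityParam, beta, data);
%       Jm = sparseAutoencoderCost(theta - e, visibleSize, hiddenSize, ...
%                                  lambda, sparsityParam, beta, data);
%       numgrad(i) = (Jp - Jm) / (2*epsilon);
%   end
%   norm(numgrad - grad) / norm(numgrad + grad)   % should be very small (~1e-9)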
|
|
%-------------------------------------------------------------------
% Here's an implementation of the sigmoid function, which you may find useful
% in your computation of the costs and the gradients.  This inputs a (row or
% column) vector (say (z1, z2, z3)) and returns (f(z1), f(z2), f(z3)).

function sigm = sigmoid(x)
    sigm = 1 ./ (1 + exp(-x));
end
|
|
% Derivative of the sigmoid function. Equivalently, it can be computed as
% f'(x) = f(x).*(1 - f(x)) where f is the sigmoid.
function fp = fprime(x)
    fp = exp(-x) ./ (1 + exp(-x)).^2;
end
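
%-------------------------------------------------------------------
% Example of training with this cost function (a sketch only, assuming the
% UFLDL-style helper initializeParameters and the minFunc optimizer used
% elsewhere in this repository are on the path; names and values may differ):
%
%   visibleSize   = 64;      % e.g. 8x8 input patches
%   hiddenSize    = 25;
%   lambda        = 1e-4;    % weight decay
%   sparsityParam = 0.01;    % target average activation rho
%   beta          = 3;       % weight of the sparsity penalty
%   theta = initializeParameters(hiddenSize, visibleSize);
%
%   options = struct('Method', 'lbfgs', 'maxIter', 400, 'display', 'on');
%   [opttheta, cost] = minFunc(@(p) sparseAutoencoderCost(p, visibleSize, ...
%       hiddenSize, lambda, sparsityParam, beta, data), theta, options);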
|
|