function [beta] = sc_approx_pooling(feaSet, B, pyramid, gamma, knn)
%================================================
%
% Usage:
% Compute the linear spatial pyramid feature using sparse coding.
%
% Inputs:
% feaSet        -structure defining the feature set of an image
%                   .feaArr     local feature array extracted from the
%                               image, column-wise
%                   .x          x locations of each local feature, 2nd
%                               dimension of the matrix
%                   .y          y locations of each local feature, 1st
%                               dimension of the matrix
%                   .width      width of the image
%                   .height     height of the image
% B             -sparse dictionary, column-wise
% pyramid       -defines structure of the pyramid
% gamma         -sparsity regularization parameter
% knn           -k nearest neighbors selected for sparse coding
%
% Output:
% beta          -multiscale max pooling feature
%
% Written by Jianchao Yang @ NEC Research Lab America (Cupertino)
% Mentor: Kai Yu
% July 2008
%
% Revised May 2010
%===============================================

dSize = size(B, 2);
nSmp = size(feaSet.feaArr, 2);
img_width = feaSet.width;
img_height = feaSet.height;
idxBin = zeros(nSmp, 1);

sc_codes = zeros(dSize, nSmp);

% find the knn dictionary atoms most correlated with each local feature
D = feaSet.feaArr'*B;
IDX = zeros(nSmp, knn);
for ii = 1:nSmp,
    d = D(ii, :);
    [dummy, idx] = sort(d, 'descend');
    IDX(ii, :) = idx(1:knn);
end

% compute the sparse code of each local feature over its knn atoms
for ii = 1:nSmp,
    y = feaSet.feaArr(:, ii);
    idx = IDX(ii, :);
    BB = B(:, idx);
    sc_codes(idx, ii) = feature_sign(BB, y, 2*gamma);  % sc_codes: sparse coding coefficients S, with X = B*S
end

sc_codes = abs(sc_codes);

% spatial levels
pLevels = length(pyramid);
% spatial bins on each level
pBins = pyramid.^2;
% total spatial bins
tBins = sum(pBins);

beta = zeros(dSize, tBins);
bId = 0;

for iter1 = 1:pLevels,
    nBins = pBins(iter1);  % [1 4 16]

    wUnit = img_width / pyramid(iter1);
    hUnit = img_height / pyramid(iter1);

    % find to which spatial bin each local descriptor belongs
    xBin = ceil(feaSet.x / wUnit);  % feaSet.x: e.g. a 960x1 vector
    yBin = ceil(feaSet.y / hUnit);  % feaSet.y: e.g. a 960x1 vector
    idxBin = (yBin - 1)*pyramid(iter1) + xBin;

    for iter2 = 1:nBins,
        bId = bId + 1;
        sidxBin = find(idxBin == iter2);
        if isempty(sidxBin),
            continue;
        end
        % max(sc_codes, [], 2): row-wise max; beta has 1+4+16 columns,
        % z_j = max(|s_j1|, |s_j2|, ..., |s_jM|), where X = B*S, X: n x M, B: n x K, S: K x M
        beta(:, bId) = max(sc_codes(:, sidxBin), [], 2);
    end
end

if bId ~= tBins,
    error('Index number error!');
end

beta = beta(:);
beta = beta./sqrt(sum(beta.^2));
Two points are worth noting:
1) When computing beta, the original image is partitioned by the spatial pyramid into 3 levels, with 1, 4, and 16 cells respectively. In the paper, the factorization is written as X = UV (U the sparse codes, V the dictionary), so max pooling takes the maximum over each column of |U|: z_j = max(|u_1j|, |u_2j|, ..., |u_Mj|). In the code, however, the author works with X = BS (B the dictionary, S the sparse codes), so the maximum is taken over each row of |S|: z_j = max(|s_j1|, |s_j2|, ..., |s_jM|), which is exactly what max(sc_codes(:, sidxBin), [], 2) computes.
2) In the paper, z is said to have the same number of columns as U, but in the code the maximum is taken separately within each cell of the 3 levels, so beta has 1 + 4 + 16 = 21 columns before it is flattened and L2-normalized.
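The first note can be checked numerically. Below is a minimal sketch (toy matrices, not the author's data) showing that the two conventions pool to the same vector: the column-wise max of |U| in the paper's X = UV notation equals the row-wise max of |S| in the code's X = BS notation when U = S'.

K = 5;  M = 8;                      % toy dictionary size and descriptor count
S = randn(K, M);                    % code matrix as used in the code (K x M)
U = S';                             % code matrix as written in the paper (M x K)
z_code  = max(abs(S), [], 2);       % row-wise max of |S|, K x 1
z_paper = max(abs(U), [], 1)';      % column-wise max of |U|, transposed to K x 1
assert(isequal(z_code, z_paper));   % identical pooled vector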
function [beta] = sc_pooling(feaSet, B, pyramid, gamma)
%================================================
%
% Usage:
% Compute the linear spatial pyramid feature using sparse coding.
%
% Inputs:
% feaSet        -structure defining the feature set of an image
%                   .feaArr     local feature array extracted from the
%                               image, column-wise
%                   .x          x locations of each local feature, 2nd
%                               dimension of the matrix
%                   .y          y locations of each local feature, 1st
%                               dimension of the matrix
%                   .width      width of the image
%                   .height     height of the image
% B             -sparse dictionary, column-wise
% gamma         -sparsity regularization parameter
% pyramid       -defines structure of the pyramid
%
% Output:
% beta          -multiscale max pooling feature
%
% Written by Jianchao Yang @ NEC Research Lab America (Cupertino)
% Mentor: Kai Yu
% July 2008
%
% Revised May 2010
%===============================================

dSize = size(B, 2);
nSmp = size(feaSet.feaArr, 2);
img_width = feaSet.width;
img_height = feaSet.height;
idxBin = zeros(nSmp, 1);

sc_codes = zeros(dSize, nSmp);

% compute the sparse code of each local feature over the full dictionary
beta = 1e-4;                          % small ridge term (the variable is reused for the pooled feature below)
A = B'*B + 2*beta*eye(dSize);
Q = -B'*feaSet.feaArr;

for iter1 = 1:nSmp,
    sc_codes(:, iter1) = L1QP_FeatureSign_yang(gamma, A, Q(:, iter1));  % i.e. z_j = {u_1j, u_2j, ..., u_Mj}, M = size of the dictionary
end

sc_codes = abs(sc_codes);

% spatial levels
pLevels = length(pyramid);
% spatial bins on each level
pBins = pyramid.^2;  % [1, 4, 16]
% total spatial bins
tBins = sum(pBins);

beta = zeros(dSize, tBins);
bId = 0;

for iter1 = 1:pLevels,
    nBins = pBins(iter1);

    wUnit = img_width / pyramid(iter1);
    hUnit = img_height / pyramid(iter1);

    % find to which spatial bin each local descriptor belongs
    xBin = ceil(feaSet.x / wUnit);
    yBin = ceil(feaSet.y / hUnit);
    idxBin = (yBin - 1)*pyramid(iter1) + xBin;

    for iter2 = 1:nBins,
        bId = bId + 1;
        sidxBin = find(idxBin == iter2);
        if isempty(sidxBin),
            continue;
        end
        beta(:, bId) = max(sc_codes(:, sidxBin), [], 2);  % max(x, [], 1): column-wise max; max(x, [], 2): row-wise max
    end
end

if bId ~= tBins,
    error('Index number error!');
end

beta = beta(:);
beta = beta./sqrt(sum(beta.^2));
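The main script that follows picks between the two coding routines according to knn. A minimal usage sketch (feaSet, B, pyramid, gamma and knn are assumed to be prepared as in that script; fea is a placeholder name):

% knn > 0: approximate coding over the knn most correlated atoms (faster)
% knn = 0: exact L1-regularized coding over the full dictionary
if knn,
    fea = sc_approx_pooling(feaSet, B, pyramid, gamma, knn);
else
    fea = sc_pooling(feaSet, B, pyramid, gamma);
end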
Main script
% This is an example code for running the ScSPM algorithm described in "Linear
% Spatial Pyramid Matching using Sparse Coding for Image Classification" (CVPR'09)
%
% Written by Jianchao Yang @ IFP UIUC
% For any questions, please email to jyang29@ifp.illinois.edu.
%
% Revised May, 2010 by Jianchao Yang

clear all;
clc;

%% set path
addpath('large_scale_svm');
addpath('sift');
addpath(genpath('sparse_coding'));

%% parameter setting

% directory setup
img_dir = 'image';                  % directory for dataset images
data_dir = 'data';                  % directory to save the sift features of the chosen dataset
dataSet = 'Caltech101';

% sift descriptor extraction
skip_cal_sift = true;               % if 'skip_cal_sift' is false, set the following parameters
gridSpacing = 6;
patchSize = 16;
maxImSize = 300;
nrml_threshold = 1;                 % low contrast region normalization threshold (descriptor length)

% dictionary training for sparse coding
skip_dic_training = true;           % dict_Caltech101_1024 is provided, so dictionary training is skipped
nBases = 1024;
nsmp = 200000;
beta = 1e-5;                        % a small regularization for stabilizing sparse coding
num_iters = 50;

% feature pooling parameters
pyramid = [1, 2, 4];                % spatial block number on each level of the pyramid
gamma = 0.15;
knn = 200;                          % find the k-nearest neighbors for approximate sparse coding
                                    % if set 0, use the standard sparse coding

% classification test on the dataset
nRounds = 5;                        % number of random tests
lambda = 0.1;                       % regularization parameter for w
tr_num = 30;                        % training number per category

rt_img_dir = fullfile(img_dir, dataSet);
rt_data_dir = fullfile(data_dir, dataSet);

%% calculate sift features or retrieve the database directory
if skip_cal_sift,
    database = retr_database_dir(rt_data_dir);
else
    [database, lenStat] = CalculateSiftDescriptor(rt_img_dir, rt_data_dir, gridSpacing, patchSize, maxImSize, nrml_threshold);
end;

%% load sparse coding dictionary (one dictionary trained on Caltech101 is provided: dict_Caltech101_1024.mat)
Bpath = ['dictionary/dict_' dataSet '_' num2str(nBases) '.mat'];
Xpath = ['dictionary/rand_patches_' dataSet '_' num2str(nsmp) '.mat'];

if ~skip_dic_training,
    try
        load(Xpath);
    catch
        X = rand_sampling(database, nsmp);  % if no patches are provided, sample nsmp patches from the database
        save(Xpath, 'X');
    end
    [B, S, stat] = reg_sparse_coding(X, nBases, eye(nBases), beta, gamma, num_iters);
    save(Bpath, 'B', 'S', 'stat');
else
    load(Bpath);
end

nBases = size(B, 2);                % size of the dictionary

%% calculate the sparse coding feature
dimFea = sum(nBases*pyramid.^2);    % feature dimension per image: 1024*(1+4+16)
numFea = length(database.path);     % number of images; database.path holds the path of each image of each class

sc_fea = zeros(dimFea, numFea);     % one pooled feature per column, numFea features in total
sc_label = zeros(numFea, 1);

disp('==================================================');
fprintf('Calculating the sparse coding feature...\n');
fprintf('Regularization parameter: %f\n', gamma);
disp('==================================================');

for iter1 = 1:numFea,
    if ~mod(iter1, 50),
        fprintf('.\n');
    else
        fprintf('.');
    end;
    fpath = database.path{iter1};
    load(fpath);
    if knn,
        sc_fea(:, iter1) = sc_approx_pooling(feaSet, B, pyramid, gamma, knn);
    else
        sc_fea(:, iter1) = sc_pooling(feaSet, B, pyramid, gamma);
    end
    sc_label(iter1) = database.label(iter1);
end;

%% evaluate the performance of the computed feature using linear SVM
[dimFea, numFea] = size(sc_fea);

clabel = unique(sc_label);
nclass = length(clabel);
accuracy = zeros(nRounds, 1);

for ii = 1:nRounds,
    fprintf('Round: %d...\n', ii);
    tr_idx = [];
    ts_idx = [];

    for jj = 1:nclass,
        idx_label = find(sc_label == clabel(jj));
        num = length(idx_label);

        idx_rand = randperm(num);

        tr_idx = [tr_idx; idx_label(idx_rand(1:tr_num))];
        ts_idx = [ts_idx; idx_label(idx_rand(tr_num+1:end))];
    end;

    tr_fea = sc_fea(:, tr_idx);
    tr_label = sc_label(tr_idx);

    ts_fea = sc_fea(:, ts_idx);
    ts_label = sc_label(ts_idx);

    [w, b, class_name] = li2nsvm_multiclass_lbfgs(tr_fea', tr_label, lambda);  % train multi-class SVM via lbfgs, one-against-all decomposition

    [C, Y] = li2nsvm_multiclass_fwd(ts_fea', w, b, class_name);  % make multi-class prediction

    acc = zeros(length(class_name), 1);

    for jj = 1:length(class_name),
        c = class_name(jj);
        idx = find(ts_label == c);
        curr_pred_label = C(idx);
        curr_gnd_label = ts_label(idx);
        acc(jj) = length(find(curr_pred_label == curr_gnd_label))/length(idx);
    end;

    accuracy(ii) = mean(acc);
end;

fprintf('Mean accuracy: %f\n', mean(accuracy));
fprintf('Standard deviation: %f\n', std(accuracy));
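As a quick check of the default settings above (nBases = 1024, pyramid = [1, 2, 4]), the dimensionality of the pooled feature per image works out as follows:

nBases  = 1024;
pyramid = [1, 2, 4];
dimFea  = sum(nBases * pyramid.^2)   % = 1024 * (1 + 4 + 16) = 21504 dimensions per image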