% README for MIC code
% 16 June 2009
%
% This file contains all the functions needed to run stepwise regression
% using the three coding schemes described in
%   Paramveer S. Dhillon, Brian Tomasik, Dean Foster, and Lyle Ungar.
%   "Multi-Task Feature Selection using the Multiple Inclusion Criterion
%   (MIC)." ECML PKDD '09.
%
% Given an n x m matrix X of features and an n x h matrix Y of responses,
% the algorithms can be run as follows:
%   mic(X,Y,'method','Partial MIC')
%   mic(X,Y,'method','Full MIC')
%   mic(X,Y,'method','RIC')
% For example:
%   X = randn(100,5);
%   w = zeros(5,3); w(2,2) = 1; w(4,1) = -1;
%   Y = X * w + randn(100,3) * 0.1;
%   mic(X,Y,'method','Partial MIC');
%
% Several functions take additional parameters, documented below; for the
% experiments in the above-cited paper they were all left at the current
% default settings.
% The amount of output can be increased by changing the 'verbosity'
% setting from 0 to 1 or 2.

function [betaHat YHat] = mic(X, Y, varargin)
% Many of the parameters are self-explanatory, but some need elaboration:
%
% method: 'Partial MIC' uses the log^* k + c_h penalty to code k, the
%   number of nonzero responses. To code k with log h instead, set
%   method to 'partial_logh'. The 'Full MIC' setting is what you'd
%   expect. 'indep', however, is NOT 'RIC' from the paper: rather, it
%   uses the regular MIC likelihood term, just with a diagonal sigmaHat,
%   and employs h-1 semicolons to distinguish which features go with
%   which response. To run RIC as described in the paper, use 'RIC',
%   which calls mic on each response separately, so that (with h==1) the
%   result is equivalent to RIC.
% hypothesis: Default is 'no'. Otherwise, do the hypothesis-testing
%   version of MIC, in which we regress on each feature separately.
% eyeParam: sigmaHat is a convex combination of the full covariance
%   estimate and a diagonal one:
%     sigmaHat = (1-eyeParam)*(full estimate) + eyeParam*(diagonal estimate).
%   The default is eyeParam==1 because, by default, we want sigmaHat to
%   be diagonal.
% forceDiagSigmaHat: Largely redundant -- this just makes sure that
%   eyeParam is set to 1 and that we're regularizing sigmaHat using the
%   convex-combination approach.
% useAltSigmaHat: Default is false. If true, recompute sigmaHat each time
%   we try adding a new feature. Tends to cause overfitting.
% shrinkMethod: Different ways to regularize sigmaHat. 'givenEyeParam' is
%   the convex-combination method described above and seems to work
%   best. For the details of the others, see the documentation for the
%   sampleCov function.
% costPerCoeff: Actually set to 2, though in a roundabout way that first
%   sets it to DEFAULT (== -3) and then changes it. This allows it to
%   differ between the hypothesis and non-hypothesis versions.
% nBestFeaturesCheck: During an iteration of stepwise search, look
%   through only this number of the best features, where 'best' means
%   'biggest improvement in log-likelihood when all responses are
%   added'.
% altHypAndDiagSigmaHat: As a warning says below, don't use this: it
%   causes overfitting.
% pForFeatureCost: Set to p unless another value is provided. It's not
%   meant to be changed by an external user, only by our own recursive
%   call in the hypothesis-code for-loop.
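% A minimal hedged example of overriding the optional parameters above
% (names as defined by the input parser below; values illustrative only):
%   X = randn(100,5); Y = randn(100,3);
%   [betaHat YHat] = mic(X, Y, 'method','Partial MIC', ...
%                        'verbosity',1, 'nBestFeaturesCheck',Inf);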
% Define constants.
NOT_PROVIDED = -1;
NONE = -2;
DEFAULT = -3;

% Parse input.
parser = inputParser;
parser.FunctionName = 'mic';
parser.addRequired('X', @(x)or(isnumeric(x),islogical(x)));
parser.addRequired('Y', @(x)or(isnumeric(x),islogical(x)));
parser.addParamValue('method', 'Partial MIC', ...
    @(x)any(strcmpi(x, {'partial_logh','Partial MIC','Full MIC',...
                        'indep','RIC'})));
% WARNING: 'indep' is NOT 'Independent' as used in the paper; rather,
% 'RIC' is. See the explanation in the parameter-definition section at
% the beginning of the file.
parser.addParamValue('hypothesis', 'no', ...
    @(x)any(strcmpi(x, {'no','bonferroni-style','simes-style'})));
parser.addParamValue('verbosity', 0, @(x)ismember(x,0:3));
% Verbosity 3 additionally prints the per-feature search trace below.
parser.addParamValue('eyeParam', 1, @(x)and(isnumeric(x),x>=0)); % diagonal!!
parser.addParamValue('forceDiagSigmaHat', 1, @(x)or(x==true,x==false));
parser.addParamValue('useAltSigmaHat',0,@(x)or(x==0,x==1));
parser.addParamValue('shrinkMethod', 'givenEyeParam', ...
    @(x)any(strcmpi(x, {'givenEyeParam','LedoitWolf','SchaferEtAl'})));
parser.addParamValue('costPerCoeff', DEFAULT);
parser.addParamValue('omitInterceptBetaHats', 0, @(x)or(x==0,x==1));
% Use the above for hypothesis versions where we care about which
% features are chosen and don't want to deal with the intercept.
parser.addParamValue('nBestFeaturesCheck',75,@(x)x>0);
% Set to Inf for maximum search, but slow.
parser.addParamValue('altHypAndDiagSigmaHat', 0, @(x)or(x==0,x==1));
parser.addParamValue('pForFeatureCost', NOT_PROVIDED);
parser.parse(X, Y, varargin{:});

X = parser.Results.X;
Y = parser.Results.Y;
method = parser.Results.method;
hypothesis = parser.Results.hypothesis; % Do hypothesis-testing version?
verbosity = parser.Results.verbosity;
eyeParam = parser.Results.eyeParam;
forceDiagSigmaHat = parser.Results.forceDiagSigmaHat;
useAltSigmaHat = parser.Results.useAltSigmaHat;
shrinkMethod = parser.Results.shrinkMethod;
costPerCoeff = parser.Results.costPerCoeff;
omitInterceptBetaHats = parser.Results.omitInterceptBetaHats;
nBestFeaturesCheck = parser.Results.nBestFeaturesCheck;
altHypAndDiagSigmaHat = parser.Results.altHypAndDiagSigmaHat;
pForFeatureCost = parser.Results.pForFeatureCost;

% WARNING: Don't use this option, because it causes overfitting!!
altHypAndDiagSigmaHat = false;

% Do hypothesis MIC?
if(strcmpi(hypothesis,'no')) % Ordinary stepwise MIC
    % Set costPerCoeff for the non-hypothesis version.
    if(costPerCoeff==DEFAULT)
        costPerCoeff = 2;
    end

    % Require that exactly the first column of X be 1s.
    firstColOnes = all(X(:,1)==1);
    if(~firstColOnes)
        X = [ones(size(X,1),1) X];
        if(omitInterceptBetaHats)
            warning('Adding a column of 1s to X. It will not show up in betaHat, so Yhat ~= X * betaHat.');
        else
            warning('Adding a column of 1s to X. This will be reflected in the dimensions of betaHat.');
        end
    end
    assert(~any(all(X(:,2:end)==1)),'No other column should be all 1s.');

    % Initialize values.
    [n,p] = size(X);
    if(pForFeatureCost==NOT_PROVIDED)
        pForFeatureCost = p;
    end
    [n2,h] = size(Y);
    assert(n == n2);
    if(forceDiagSigmaHat || ismember(method,{'indep'}))
        shrinkMethod = 'givenEyeParam';
        eyeParam = 1;
    end

    % Create a 'model' struct to store important state.
    model.X = X;
    model.Y = Y;
    model.h = h;
    model.verbosity = verbosity;
    model.useAltSigmaHat = useAltSigmaHat;
    model.altHypAndDiagSigmaHat = altHypAndDiagSigmaHat;
    model.q = ones(1,h); % # features for each response
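    % For reference, the handle stored below shrinks the residual
    % covariance toward its diagonal; e.g., with eyeParam = 0.5 (an
    % illustrative value), sampleCov returns roughly
    %   sigmaHat = 0.5*S_full + 0.5*diag(diag(S_full)),
    % and with the default eyeParam = 1 the estimate is exactly diagonal.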
    % Store a method that, given residuals, computes sigmaHat.
    model.isCorrMatrix = false; % change to 'true' if we ever standardize X and Y
    model.sigmaHatOfResiduals = ...
        @(residuals)sampleCov(residuals,'eyeParam',eyeParam,...
            'shrinkMethod',shrinkMethod,...
            'detMakePosStruct',struct('do',1,'isCorrMatrix',model.isCorrMatrix));
    model.sigmaHatIsDiag = and(strcmpi(shrinkMethod,'givenEyeParam'),...
        eyeParam>.999);

    % Start computing betaHat.
    model.betaHat = zeros(p,h);
    model.betaHat(1,:) = mean(model.Y); % Initial intercept terms.
    model.curResiduals = model.Y - model.X * model.betaHat;
    model.curNullSigmaHat = model.sigmaHatOfResiduals(model.curResiduals);
    model.curNullSigmaHatInv = inv(model.curNullSigmaHat);
    if(verbosity>1)
        fprintf('Initial sigmaHat:\n');
        initialSigmaHat = model.curNullSigmaHat;
        initialSigmaHat = initialSigmaHat % No semicolon: echo the matrix.
    end

    % Split between RIC and the other two methods.
    if(strcmpi(method,'RIC'))
        % Ignore everything we've done so far, and get betaHat by doing h
        % separate calls to MIC indep with h==1.
        for iResponse = 1:h
            model.betaHat(:,iResponse) = mic(X,Y(:,iResponse),...
                'method','indep',...
                'hypothesis',hypothesis,...
                'costPerCoeff',costPerCoeff-1,... % Important: 'indep' normally pays 1 extra bit per coefficient for semicolons, so we reduce the cost here.
                'omitInterceptBetaHats',0,... % Not now; do that at the end of ourselves.
                'verbosity',verbosity,...
                'eyeParam',eyeParam,...
                'forceDiagSigmaHat',forceDiagSigmaHat,...
                'useAltSigmaHat',useAltSigmaHat,...
                'shrinkMethod',shrinkMethod,...
                'altHypAndDiagSigmaHat',altHypAndDiagSigmaHat,...
                'nBestFeaturesCheck',nBestFeaturesCheck,...
                'pForFeatureCost',pForFeatureCost);
        end
    else
        % Store model coding costs for later use.
        if(strcmpi(method,'Full MIC'))
            % We should only ever be looking at costs for the Full MIC model.
            model.modelCostOfK = repmat(NaN,1,h);
            model.modelCostOfK(h) = log2(pForFeatureCost) + ... % log2(pForFeatureCost) for the feature cost
                nonFeatureCodingPenalty(method,h,h,costPerCoeff);
        else
            model.modelCostOfK = log2(pForFeatureCost) + ... % log2(pForFeatureCost) for the feature cost
                arrayfun(@(k)nonFeatureCodingPenalty(method,h,k,costPerCoeff),1:h);
        end

        % Initialize coding cost.
        if(ismember(method,{'indep'}))
            model.codingCost = h-1; % code the h-1 semicolons
        else
            model.codingCost = 0;
        end

        % Store info on what features we've used.
        model.nFeatures = 1;
        model.featuresNotYetUsed = 2:p;

        % Below is the model with just intercept terms. This assumes no
        % cost to code the intercept coefficients (cheating...).
        [model.likelihoodCost shouldEqualBetaHat] = ...
            likelihoodCostOfResponseSubset(ones(1,h),1,model);
        assert(all(all(abs(shouldEqualBetaHat-model.betaHat)<.00001)));
        if(verbosity>0)
            fprintf('\n\n');
            fprintf('Method=%s, costPerCoeff=%i\n',method,costPerCoeff);
            print_diagnostics(model,1,h,NaN,NaN);
        end

        % Store previous values of the costs for use in printing out
        % changes later on.
        prevCodingCost = model.codingCost;
        prevLikelihoodCost = model.likelihoodCost;
        bestCostSoFar = model.codingCost + model.likelihoodCost;
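        % For orientation: the search below compares total description
        % length, codingCost + likelihoodCost (both in bits), before and
        % after each candidate addition. E.g., under 'Partial MIC' a
        % feature is kept only if it buys more bits of likelihood than
        % its model cost,
        %   log2(p) + codeForIntegers(k,h) + log2nChoosek(h,k) + k*costPerCoeff.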
        % Try adding features until it no longer helps.
        for nFeatures = 2:p
            % Find the next best feature.
            bestFeatureToAdd = NONE;

            % Choose a set of features to examine to see if we want to
            % add them. If we don't care about speed, look through all
            % the remaining features. Otherwise, choose a subset of them:
            % sort the unused features by log-likelihood of the Full MIC
            % model and, if we're optimizing, check the smallest ones only.
            negLogLikeCosts = arrayfun(@(feature)likelihoodCostOfResponseSubset(ones(1,h),feature,model),model.featuresNotYetUsed);
            [sortedNegLogLikeCosts indNegLogLikeCosts] = sort(negLogLikeCosts);
            checkTheseFeatures = model.featuresNotYetUsed(indNegLogLikeCosts(1:min(nBestFeaturesCheck,length(indNegLogLikeCosts))));
            % TODO: Sort checkTheseFeatures in increasing order by cost
            % at that index.
            % TODO: Sort the responses and do a similar branch-and-bound
            % there!
            % TODO: Improve my lower bound on k in the response
            % branch-and-bound to be the one in the paper.

            % Now check the features.
            if(verbosity>2 && ~strcmpi(method,'Full MIC'))
                fprintf('\tConsidering feature');
            end
            for curFeature = checkTheseFeatures
                % TODO: Add a branch-and-bound check of whether to even
                % bother with this feature. Is the lower bound on its
                % description length (i.e., its residual cost plus lg p)
                % smaller than the best actual description length seen
                % so far?
                if(verbosity>2 && ~strcmpi(method,'Full MIC'))
                    fprintf(' %i,',curFeature);
                end

                % Get costs if added.
                if(strcmpi(method,'Full MIC'))
                    [curNegLogLike candidateNewBetaHat] = ...
                        likelihoodCostOfResponseSubset(ones(1,h),curFeature,model);
                    costIfAdded = curNegLogLike + ...
                        model.codingCost + ...
                        model.modelCostOfK(h);
                else
                    [candidateNewBetaHat costIfAdded curNegLogLike] = ...
                        bestResponseSubset(curFeature,model,bestCostSoFar);
                end

                % Is it low enough?
                if(costIfAdded < bestCostSoFar)
                    bestFeatureToAdd = curFeature;
                    bestCostSoFar = costIfAdded;
                    bestLikelihoodCost = curNegLogLike;
                    bestNewBetaHat = candidateNewBetaHat;
                    if(verbosity>2 && ~strcmpi(method,'Full MIC'))
                        fprintf('\n\t\tNew cost = %f with %i responses for feature %i.\n\tConsidering feature',bestCostSoFar,nnz(bestNewBetaHat(curFeature,:)),curFeature);
                    end
                end
            end

            % Add that feature to the model.
            if(bestFeatureToAdd~=NONE)
                % Update state.
                model.nFeatures = model.nFeatures + 1;
                model.likelihoodCost = bestLikelihoodCost;
                model.betaHat = bestNewBetaHat;
                cur_k = nnz(model.betaHat(bestFeatureToAdd,:));
                model.codingCost = model.codingCost + model.modelCostOfK(cur_k);
                model.curResiduals = model.Y - model.X * model.betaHat;
                model.curNullSigmaHat = model.sigmaHatOfResiduals(model.curResiduals);
                model.curNullSigmaHatInv = inv(model.curNullSigmaHat);
                model.q = arrayfun(@(col)nnz(model.betaHat(:,col)),1:model.h);

                % Remove the feature from the list of unused features.
                indexOfFeatureToRemove = ...
                    find(model.featuresNotYetUsed==bestFeatureToAdd);
                assert(length(indexOfFeatureToRemove)==1);
                model.featuresNotYetUsed(indexOfFeatureToRemove) = [];

                % Print diagnostics.
                print_diagnostics(model,bestFeatureToAdd,cur_k,prevLikelihoodCost,prevCodingCost);
                if(verbosity>1)
                    curSigmaHat = model.curNullSigmaHat;
                    curSigmaHat = curSigmaHat % No semicolon: echo the matrix.
                end

                % Store previous values of the costs for use in printing
                % out changes later on.
                prevCodingCost = model.codingCost;
                prevLikelihoodCost = model.likelihoodCost;
                assert(abs(bestCostSoFar-(model.codingCost+model.likelihoodCost))<.001);
            else
                break;
            end
        end
    end % End of split between 'RIC' and the other two methods.

    % Return betaHat.
    betaHat = model.betaHat;

    % Prepare return results.
    YHat = X * betaHat;
    if(omitInterceptBetaHats)
        betaHat = betaHat(2:end,:);
    end
    if(verbosity>1)
        fprintf('Final sigmaHat:\n');
        finalSigmaHat = model.curNullSigmaHat;
        finalSigmaHat = finalSigmaHat % No semicolon: echo the matrix.
    end
    if(verbosity>0)
        min_p_and_20 = min(size(betaHat,1),20);
        fprintf('First %i rows of betaHat:\n',min_p_and_20);
        firstRowsOfBetaHat = betaHat(1:min_p_and_20,:)
    end
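    % Hedged usage sketch: with the default intercept handling, X gains a
    % leading column of 1s, so
    %   YHat == [ones(n,1) X_orig] * betaHat
    % (unless omitInterceptBetaHats drops the intercept row), and
    % betaHat(1,:) holds the per-response intercepts.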
elseif(ismember(hypothesis,{'bonferroni-style','simes-style'}))
    % Gives FDRs around 0.05-0.1.
    if(costPerCoeff==DEFAULT)
        costPerCoeff = 2;
    end

    % Should not have a column of 1s in X.
    assert(~any(all(X==1)),'No column of X should be all 1s.');

    % Initialize values.
    [n,p] = size(X);
    if(pForFeatureCost==NOT_PROVIDED)
        pForFeatureCost = p;
    end
    [n2,h] = size(Y);
    assert(n == n2);

    % Initialize output matrices.
    betaHat = zeros(p,h);
    YHat = repmat(NaN,n,h); % There is no single YHat for the hypothesis version.

    % BONFERRONI-STYLE HYPOTHESIS
    if(strcmpi(hypothesis,'bonferroni-style'))
        % Fill in each row of betaHat as the result of regressing on just
        % that x.
        for iFeature = 1:p
            fprintf('Doing feature %i.\n',iFeature);
            cur_X = [ones(size(X,1),1) X(:,iFeature)];
            betaHat(iFeature,:) = mic(cur_X,Y,...
                'omitInterceptBetaHats',1,...
                'hypothesis','no',...
                'method',method,...
                'costPerCoeff',costPerCoeff,...
                'verbosity',0,...
                'eyeParam',eyeParam,...
                'forceDiagSigmaHat',forceDiagSigmaHat,...
                'useAltSigmaHat',useAltSigmaHat,...
                'shrinkMethod',shrinkMethod,...
                'altHypAndDiagSigmaHat',altHypAndDiagSigmaHat,...
                'nBestFeaturesCheck',nBestFeaturesCheck,...
                'pForFeatureCost',pForFeatureCost);
            nnz_cur_feature = nnz(betaHat(iFeature,:));
            if(nnz_cur_feature > 0)
                fprintf('\tPut in %i responses.\n',nnz_cur_feature);
            end
        end

    % SIMES-STYLE HYPOTHESIS
    elseif(strcmpi(hypothesis,'simes-style'))
        % Guess the number of features that will be put in. When we guess
        % right (which must happen eventually), that's our answer.
        guessed_n_features = 1;
        while(true)
            total_feature_cost = codeForIntegers(guessed_n_features, p) + ...
                log2nChoosek(p,guessed_n_features);
            amortized_feature_cost = total_feature_cost / guessed_n_features;
            p_such_that_lg_p_is_cost = 2^amortized_feature_cost;
            if(verbosity > 0)
                fprintf('Guessing %i features with amortized cost %.3f.\n',...
                    guessed_n_features,amortized_feature_cost);
            end
            betaHat = mic(X,Y,...
                'method',method,...
                'hypothesis','bonferroni-style',...
                'costPerCoeff',costPerCoeff,...
                'verbosity',verbosity,...
                'eyeParam',eyeParam,...
                'forceDiagSigmaHat',forceDiagSigmaHat,...
                'useAltSigmaHat',useAltSigmaHat,...
                'shrinkMethod',shrinkMethod,...
                'altHypAndDiagSigmaHat',altHypAndDiagSigmaHat,...
                'nBestFeaturesCheck',nBestFeaturesCheck,...
                'pForFeatureCost',p_such_that_lg_p_is_cost);
            n_features_betaHat = nnz(sum(abs(betaHat),2));
            if(verbosity > 0)
                fprintf('Actually %i features.\n\n',n_features_betaHat);
            end
            if(n_features_betaHat==guessed_n_features)
                break; % We're done; we have the betaHat we want.
            elseif(n_features_betaHat < guessed_n_features)
                guessed_n_features = n_features_betaHat;
            else
                guessed_n_features = guessed_n_features + 1;
            end
        end
    end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function sigmaHat = sampleCov(epsHat, varargin)
% sampleCov
% Input:
%   epsHat: An n x h matrix of residuals.
% Output:
%   sigmaHat: An h x h regularized sample covariance matrix.

% Define constants.
NOT_PROVIDED = -1;

% Parse input.
parser = inputParser;
parser.FunctionName = 'sampleCov';
parser.addRequired('epsHat', @isnumeric);
parser.addParamValue('eyeParam', 1, @(x)and(isnumeric(x),x>=0));
parser.addParamValue('shrinkMethod', 'givenEyeParam', ...
    @(x)any(strcmpi(x, {'givenEyeParam','LedoitWolf','SchaferEtAl'})));
parser.addParamValue('subtractEpsMean', 1, @(x)or(x==true,x==false));
parser.addParamValue('detMakePosStruct', struct('do',0,'isCorrMatrix',1));
% This parameter holds a struct whose first field tells whether to make
% the resulting sigmaHat have positive determinant and whose second field
% tells whether, in the process, we should assume that sigmaHat
% represents a correlation matrix.
parser.addParamValue('df', NOT_PROVIDED, @(x)and(isnumeric(x),or(x>=0,x==NOT_PROVIDED)));
% If you want the best unbiased estimate, take df == the number of free
% parameters of the regression model, including the intercept.
parser.parse(epsHat, varargin{:});
epsHat = parser.Results.epsHat;
eyeParam = parser.Results.eyeParam;
subtractEpsMean = parser.Results.subtractEpsMean;
shrinkMethod = parser.Results.shrinkMethod;
detMakePosStruct = parser.Results.detMakePosStruct;
df = parser.Results.df;

% Set df if not given.
if(df==NOT_PROVIDED)
    if(subtractEpsMean)
        df = 1;
    else
        df = 0;
    end
end
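% Hedged usage sketch (parameter values illustrative only):
%   resid = Y - X*betaHat;
%   S = sampleCov(resid, 'eyeParam',0.5, 'shrinkMethod','givenEyeParam');
% would return a covariance estimate shrunk halfway toward its diagonal.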
% Get sizes.
[n, h] = size(epsHat);

% Subtract the mean? Matlab's 'cov' function does.
if(subtractEpsMean)
    epsHat = epsHat - repmat(mean(epsHat),n,1);
end

% Check that df is okay.
if df >= n
    error('df = %i must be less than n = %i.',df,n);
end
S_h = epsHat' * epsHat / (n-df);

% Choose method.
switch shrinkMethod
    case 'givenEyeParam'
        if(df==1)
            diagSigmaHat = diag(var(epsHat)); % divide by n-1
        else
            diagSigmaHat = diag(var(epsHat,1)); % divide by n
        end
        sigmaHat = (1-eyeParam) * S_h + eyeParam * diagSigmaHat; % Can change target.
        % Note: Above we're shrinking toward a diagonal matrix with the
        % *MLE* variances (divide by n, not n-1). This is because a
        % t-test implicitly uses this type of covariance matrix (because
        % it's a GLRT).
    case 'LedoitWolf'
        % Note: This tends to be pretty conservative in the amount it
        % shrinks. For instance, on one data set, the resulting sample
        % covariance matrices had determinant values on the order of
        % those obtained by taking eyeParam ~= 0.01.
        m_h = trace(S_h) / h;
        d_hSq = frobeniusNorm(S_h - m_h*eye(h));
        temp = 0;
        for i = 1:h
            temp = temp + frobeniusNorm(S_h(:,i) * S_h(:,i)' - S_h);
        end
        bBar_hSq = temp / h^2;
        b_hSq = min(bBar_hSq, d_hSq);
        a_hSq = d_hSq - b_hSq;
        sigmaHat = (a_hSq / d_hSq) * S_h + (b_hSq / d_hSq) * m_h * eye(h);
    case 'SchaferEtAl'
        % This case took much debugging, but it finally works! I checked
        % it against the real version of the function in R on two
        % separate examples, and both produced correct output. The second
        % example involved lambdaStar and lambda both strictly between 0
        % and 1, so I didn't just get lucky by having one of them set to
        % 1, which might have hidden errors.

        % Compute rij and the estimated variances of rij.
        % See p. 28 of
        % http://strimmerlab.org/publications/shrinkcov2005.pdf
        X = epsHat;
        centeredX = X - repmat(mean(X),n,1);
        scaledCenteredX = centeredX ./ repmat(std(X),n,1); % Standardize X first.
        R = zeros(h,h);
        VarHatR = zeros(h,h);
        for i = 1:h
            for j = 1:h
                wij = scaledCenteredX(:,i) .* scaledCenteredX(:,j); % wij is n x 1.
                wBarij = mean(wij);
                R(i,j) = (n / (n - df)) * wBarij;
                VarHatR(i,j) = (n / (n-df)^3) * sum((wij - wBarij) .^ 2);
            end
        end

        % Now compute lambdaStar.
        numerator = sum(sum(VarHatR)) - sum(diag(VarHatR));
        RSq = R .^ 2;
        denominator = sum(sum(RSq)) - sum(diag(RSq));
        if denominator==0
            lambdaStar = 1;
        else
            lambdaStar = numerator / denominator;
        end
        if(lambdaStar>1)
            lambdaStar = 1;
        end

        % Compute RStar.
        RStar = R * min(1,max(0,1-lambdaStar));
        for i = 1:h
            RStar(i,i) = 1;
        end

        % Look at sample variances.
        sigmaHat = zeros(h,h);
        variances = diag(S_h); % Get variances.
        target = median(variances);

        % Compute lambda.
        centerSq = centeredX .^ 2;
        numerator2 = sum((n^2 / (n-df)^3) * var(centerSq,1)); % use the divide-by-n variances with var(.,1)
        denominator2 = sum((sum(centerSq) / (n-df) - target) .^ 2);
        if denominator2==0
            lambda = 1;
        else
            lambda = numerator2 / denominator2;
        end
        if(lambda > 1)
            lambda = 1;
        end

        % Compute sigmaHat (what the paper calls S*).
        newVariances = lambda * target + (1-lambda) * variances;
        newSDs = sqrt(newVariances);
        for i = 1:h
            for j = 1:h
                sigmaHat(i,j) = RStar(i,j) * newSDs(i) * newSDs(j);
            end
        end
    otherwise
        error('shrinkMethod does not match the possibilities.');
end
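% Hedged numerical sketch of the 'givenEyeParam' case: with df == 1,
%   S_h = [1.0 0.8; 0.8 1.0] and eyeParam = 0.75 give
%   sigmaHat = 0.25*S_h + 0.75*diag(diag(S_h)) = [1.0 0.2; 0.2 1.0],
% i.e., the off-diagonal entries shrink by a factor of 1-eyeParam.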
% Check that the determinant is positive?
if(detMakePosStruct.do)
    sigmaHat = detMakePos(sigmaHat,detMakePosStruct.isCorrMatrix);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function val = frobeniusNorm(A)
% function val = frobeniusNorm(A)
% Input:
%   A: a x b matrix
% Output:
%   val: trace(A*A') / a, the squared Frobenius norm of A scaled by the
%     number of rows (the normalization used in the Ledoit-Wolf
%     estimator above).
val = trace(A*A') / size(A,1);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [M detM] = detMakePos(M,isCorrMatrix)
% Given a square matrix M, check that its determinant is positive. If
% not, make it positive definite using nearPD. Note that mathematically,
% checking the determinant is necessary but not sufficient for positive
% definiteness. However, checking, say, eigenvalues could get expensive,
% so we're lazy and don't bother -- if it's a problem, it will show up
% elsewhere.
% The 'isCorrMatrix' input argument is used by nearPD to decide whether
% the given matrix is a correlation matrix or a regular covariance
% matrix.

% Parse input.
parser = inputParser;
parser.FunctionName = 'detMakePos';
parser.addRequired('M', @(M)and(isnumeric(M),size(M,1)==size(M,2)));
parser.addRequired('isCorrMatrix', @(x)or(x==0,x==1));
parser.parse(M, isCorrMatrix);
M = parser.Results.M;
isCorrMatrix = parser.Results.isCorrMatrix;

% Check the determinant. If not positive, change M.
detM = det(M);
if(detM <= 0)
    M = nearPD(M,'isCorrMatrix',isCorrMatrix);
    detM = det(M);
    assert(detM > 0, 'nearPD failed to make the matrix positive-definite.');
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function val = nonFeatureCodingPenalty(method, h, k, costPerCoefficient)
% nonFeatureCodingPenalty
% Input:
%   method: Which coding method to use.
%   h: # responses.
%   k: # nonzero responses in the subset.
%   costPerCoefficient: How many bits it takes to code a coefficient.
% Output:
%   val: The cost of coding the nonzero parameters (if any), except for
%     the cost of identifying which features are in the model; that part
%     comes later in the mic.m code.
if k == 0
    val = 0;
else
    switch method
        case 'indep'
            val = k * (costPerCoefficient + 1); % the +1 is for semicolons
        case 'partial_logh'
            val = log2(h) + log2nChoosek(h, k) + k * costPerCoefficient;
        case 'Partial MIC'
            val = codeForIntegers(k,h) + ...
                log2nChoosek(h,k) + k * costPerCoefficient;
        case 'conservativePartial'
            val = k * twoLogCodeCost(h/k) + k * costPerCoefficient;
            % !!!CAN CHANGE THE ABOVE to make it correspond to actual
            % distances.
        case 'Full MIC'
            assert(h == k, ['Number of rejected ' ...
                'hypotheses should equal number of responses.'])
            val = k * costPerCoefficient;
        otherwise
            error('Unknown method');
    end
end
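% Hedged arithmetic sketch for 'Partial MIC' with h = 10 responses,
% k = 3 nonzero, and costPerCoefficient = 2:
%   codeForIntegers(3,10) ~ 3.3 bits   (universal code for k)
%   log2nChoosek(10,3)    ~ 6.9 bits   (which 3 of the 10 responses)
%   3 * 2                 = 6   bits   (the coefficients themselves)
% for a penalty of roughly 16 bits, before the log2(p) feature cost.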
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function val = codeForIntegers(k,h,varargin)
% codeForIntegers
% Input:
%   k: A positive integer to be encoded.
%   h: The upper bound on the range of possible integer values. That is,
%     our code is for the integers {1, 2, ..., h}.
%   startAtZero (optional): If true, code {0, 1, ..., h}.
% Output:
%   val: The cost of coding the integer k.
%
% Based on "A universal prior for integers and estimation by minimum
% description length", Rissanen 1983.

% Parse input.
parser = inputParser;
parser.FunctionName = 'codeForIntegers';
parser.addRequired('k', @(x)and(isscalar(x),x>=0));
parser.addRequired('h', @(x)and(isscalar(x),x>0));
parser.addParamValue('startAtZero',0,@(x)or(x==0,x==1));
parser.parse(k,h,varargin{:});
k = parser.Results.k;
h = parser.Results.h;
startAtZero = parser.Results.startAtZero;

% Check inputs.
if(~startAtZero)
    assert(k>0,'Cannot code 0 unless startAtZero==true.');
end
assert(h >= k, 'h must be >= k.');

% Starting at zero?
if(startAtZero)
    val = codeForIntegers(k+1,h+1,'startAtZero',0);
else
    % c_omega = log2(2.865); We would use this if we had a code over all
    % the integers, but we actually just have a code on 1:h, so we
    % calculate a more appropriate normalization factor.
    if h==6715
        normalizingConst = 1.2155; % Memoized values -- a hack, but it works.
    elseif h==1000
        normalizingConst = 1.1987;
    elseif h==Inf
        normalizingConst = log2(2.865064);
    else
        probabilitiesOneToH = arrayfun(@(k)2^(-log2Star(k)),1:h);
        normalizingConst = log2(sum(probabilitiesOneToH));
    end
    val = normalizingConst + log2Star(k);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function cost = twoLogCodeCost(dist)
% twoLogCodeCost
% Input:
%   dist: An integer distance in {0, 1, 2, ...} to be coded. Hopefully
%     it's fairly small.
% Output:
%   cost: The coding cost of that distance, using a code that's roughly
%     2*log(dist). We make some minor modifications for various cases.
assert(dist>=0,'dist cannot be negative.');
% TODO: Add a check that dist is an integer.
if(dist==0)
    cost = 2*ceil(log2(dist+2));
else
    cost = 2*ceil(log2(dist+1));
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function val = log2Star(x)
% log2Star
% Input:
%   x: A positive number.
% Output:
%   val: log2*(x), the iterated logarithm sum
%     log2(x) + log2(log2(x)) + ..., taken over the positive terms.
assert(x>0,'Cannot take log2Star of a non-positive number.');
val = 0;
curTerm = log2(x);
while(curTerm > 0)
    val = val + curTerm;
    curTerm = log2(curTerm);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function val = log2nChoosek(n,k)
% log2nChoosek
% Input:
%   n: A nonnegative integer.
%   k: A nonnegative integer <= n.
% Output:
%   val: log base 2 of n choose k, computed to take advantage of the
%     fact that log grows slowly, so that values can be found even for
%     very large n and k.

% Check inputs.
assert(isscalar(n) && isscalar(k), 'n and k must be scalars.');
assert(n==round(n) && k==round(k), 'n and k must be integers.');
assert(n >= 0 && k >= 0, 'n and k must be nonnegative.');
assert(n >= k, 'n must be >= k.');

% Compute, using a formula that works even for very large n and k.
val = sum(log2(n-k+1:n)) - sum(log2(1:k));
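% Hedged arithmetic sketch: log2nChoosek(1000,3)
%   = sum(log2(998:1000)) - sum(log2(1:3)) ~ 27.3 bits,
% computed without ever forming nchoosek(1000,3) ~ 1.66e8 explicitly.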
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function out = negLogLikelihood(epsHat,sigmaHat,varargin)
% negLogLikelihood
% Input:
%   epsHat: An n x h matrix of residuals for a particular subset of
%     features.
%   sigmaHat: The sample covariance matrix to use.
% Optional parameters:
%   'sigmaHatInv': The inverse of sigmaHat.
% Output:
%   out: The negative log-likelihood term.

% Define constants.
CHANGE_ME = -5;
PRINT_WARNING = 0;

% Parse input.
parser = inputParser;
parser.FunctionName = 'negLogLikelihood';
parser.addRequired('epsHat', @isnumeric);
parser.addRequired('sigmaHat', @isnumeric);
parser.addParamValue('sigmaHatInv', CHANGE_ME, @isnumeric);
parser.addParamValue('isCorrMatrix', false, @(x)or(x==0,x==1));
parser.addParamValue('howMuchCheckPosDef', 2, @(x)or(x==1,x==2));
% 2 == all the time, 1 == only if det <= 0.
parser.addParamValue('sigmaHatIsDiag', 0, @(x)or(x==0,x==1));
parser.addParamValue('altHypAndDiagSigmaHat', 0, @(x)or(x==0,x==1));
parser.addParamValue('q', CHANGE_ME, @(x)or(all(x>=0),all(x==CHANGE_ME)));
% q is a 1 x h array of the number of features in the model, including
% the intercept. It's not needed unless altHypAndDiagSigmaHat==1.
parser.parse(epsHat, sigmaHat, varargin{:});
epsHat = parser.Results.epsHat;
sigmaHat = parser.Results.sigmaHat;
sigmaHatInv = parser.Results.sigmaHatInv;
isCorrMatrix = parser.Results.isCorrMatrix;
howMuchCheckPosDef = parser.Results.howMuchCheckPosDef;
sigmaHatIsDiag = parser.Results.sigmaHatIsDiag;
altHypAndDiagSigmaHat = parser.Results.altHypAndDiagSigmaHat;
q = parser.Results.q;

% Get n and h.
[n h] = size(epsHat);

if(~altHypAndDiagSigmaHat) % Do it the usual way.
    % If sigmaHat isn't positive-definite, recompute one that is.
    if(~sigmaHatIsDiag && howMuchCheckPosDef==2)
        if(any(eig(sigmaHat)<=0)) % Then sigmaHat isn't positive-definite.
            sigmaHat = nearPD(sigmaHat,'isCorrMatrix',isCorrMatrix);
            % A matrix being positive-definite implies that its inverse is.
            sigmaHatInv = inv(sigmaHat);
            fprintf(2,'\nChanging sigmaHat to be positive-definite.');
        end
    end

    % If necessary, compute sigmaHatInv.
    if(sigmaHatInv==CHANGE_ME)
        sigmaHatInv = inv(sigmaHat);
    end

    % Check input dimensions.
    [h1, h2] = size(sigmaHatInv);
    assert(h==h1 && h==h2,'sigmaHatInv not square!');
    [h3 h4] = size(sigmaHat);
    assert(and(h==h3,h==h4),'sigmaHat not same dimensions as sigmaHatInv.');

    % Check the determinant of sigmaHat. If sigmaHatIsDiag==1, the
    % determinant is easy....
    if(~sigmaHatIsDiag)
        detSigmaHat = det(sigmaHat);
    else
        detSigmaHat = prod(diag(sigmaHat));
    end
    if(detSigmaHat <= 0 && howMuchCheckPosDef >= 1)
        out = negLogLikelihood(epsHat,nearPD(sigmaHat,'isCorrMatrix',isCorrMatrix),...
            'isCorrMatrix',isCorrMatrix,'howMuchCheckPosDef',...
            howMuchCheckPosDef);
    else
        % Compute.
        firstTerm = (n*h/2) * log2(2*pi) + (n/2)*log2(detSigmaHat);
        undividedSecondTerm = 0;
        % If sigmaHatInv is diagonal, use a shortcut in computing this
        % second term.
        %sigmaHatIsDiag = all(all((sigmaHatInv~=0)==eye(size(sigmaHatInv))));
        if(sigmaHatIsDiag)
            summedEpsHatSq = sum(epsHat .^ 2);
            undividedSecondTerm = summedEpsHatSq * diag(sigmaHatInv); % row * column
        else
            for i = 1:n
                undividedSecondTerm = undividedSecondTerm + ...
                    epsHat(i,:) * sigmaHatInv * epsHat(i,:)';
            end
        end
        if(undividedSecondTerm <= 0)
            fprintf(2,'sigmaHatInv was supposed to be positive definite!...\n\n');
        end
        out = firstTerm + undividedSecondTerm / (2 * log(2));
        % Check whether out is negative. That would be unusual but not
        % necessarily an error, because we're dealing with density
        % functions here, not discrete probabilities.
        if(out <= 0)
            if(PRINT_WARNING > 0)
                fprintf(2,'-log2(likelihood) <= 0: %g.\n',out); % print to stderr
            end
        end
    end
else % case: altHypAndDiagSigmaHat
    dividedSumSqErr = sum(epsHat.^2) ./ (n-q);
    onePlusLogSumSq = 1 + log(2 * pi * dividedSumSqErr); % 1 x h vector
    out = (n/2) * sum(onePlusLogSumSq);
end
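% Hedged formula sketch: for residuals eps (n x h) and covariance S, the
% cost in bits computed above is
%   (n*h/2)*log2(2*pi) + (n/2)*log2(det(S))
%     + sum_i eps(i,:) * inv(S) * eps(i,:)' / (2*log(2)),
% i.e., the Gaussian negative log-likelihood converted from nats to bits.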
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [negLogLike candidateNewBetaHat] = ...
    likelihoodCostOfResponseSubset(subset,curFeature,model,varargin)

% Parse input.
parser = inputParser;
parser.FunctionName = 'likelihoodCostOfResponseSubset';
parser.addRequired('subset');
parser.addRequired('curFeature');
parser.addRequired('model');
parser.parse(subset,curFeature,model,varargin{:});
subset = parser.Results.subset;
curFeature = parser.Results.curFeature;
model = parser.Results.model;

% Use the likelihood of the MLE betaHat values.
% Compute what the new betaHat would be if curFeature were to have
% nonzero coefficients for the subset 'subset' of responses. We don't
% want to set model's betaHat field to this new betaHat, though, because
% we may not end up using it, so we store it as a temporary
% 'candidateNewBetaHat'.
oldBetaHat = model.betaHat;
candidateNewBetaHat = oldBetaHat;
cur_q = model.q + subset;
for col = find(subset)
    if(oldBetaHat(curFeature,col)==0)
        featuresForThisResponse = fast_union(curFeature,find(oldBetaHat(:,col)));
        candidateNewBetaHat(featuresForThisResponse,col) = ...
            model.X(:,featuresForThisResponse) \ model.Y(:,col);
    end
end

% Get residuals.
candidateEpsHat = model.Y - model.X * candidateNewBetaHat;
if(model.altHypAndDiagSigmaHat)
    [n h] = size(candidateEpsHat);
    dividedSumSqErr = sum(candidateEpsHat.^2) ./ (n-cur_q);
    onePlusLogSumSq = 1 + log(2 * pi * dividedSumSqErr); % 1 x h vector
    negLogLike = (n/2) * sum(onePlusLogSumSq);
else
    % Recompute sigmaHat?
    if(model.useAltSigmaHat)
        sigmaHat = model.sigmaHatOfResiduals(candidateEpsHat);
        % This isn't the MLE for sigmaHat, but oh well!
        sigmaHatInv = inv(sigmaHat);
    else
        sigmaHat = model.curNullSigmaHat;
        sigmaHatInv = model.curNullSigmaHatInv;
    end

    % Compute -log2(likelihood).
    negLogLike = negLogLikelihood(candidateEpsHat,sigmaHat,...
        'sigmaHatInv',sigmaHatInv,'isCorrMatrix',model.isCorrMatrix,...
        'sigmaHatIsDiag',model.sigmaHatIsDiag,...
        'altHypAndDiagSigmaHat',model.altHypAndDiagSigmaHat);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function out_list = fast_union(scalar, in_list)
% Because the built-in function union is so slow (enough to be one of
% the two major bottlenecks of the MIC code!), this is a faster version
% that handles one particular case: the first argument is a scalar.
if(any(scalar==in_list))
    out_list = in_list;
else
    [nRows nCols] = size(in_list);
    if(nCols > 1)
        out_list = [scalar, in_list]; % concatenate horizontally
    else
        out_list = [scalar; in_list]; % concatenate vertically
    end
end
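% Hedged usage sketch:
%   fast_union(3, [1 5 7])  % -> [3 1 5 7]
%   fast_union(5, [1 5 7])  % -> [1 5 7], unchanged
% Unlike the built-in union, the result is not sorted.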
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [candidateNewBetaHat nonFeatureCostIfAdded bestNegLogLike] = ...
    bestResponseSubset(curFeature,model,lowestNonfeatureCostEver,varargin)

% Define constants.
NONE_CONST = -7;

% Parse input.
parser = inputParser;
parser.FunctionName = 'bestResponseSubset';
parser.addRequired('curFeature');
parser.addRequired('model');
parser.addRequired('lowestNonfeatureCostEver');
parser.addParamValue('stopEarly',0,@(x)or(x==0,x==1));
parser.parse(curFeature,model,lowestNonfeatureCostEver,varargin{:});
curFeature = parser.Results.curFeature;
model = parser.Results.model;
lowestNonfeatureCostEver = parser.Results.lowestNonfeatureCostEver;
stopEarly = parser.Results.stopEarly;

% Create a function of subsets giving their likelihood cost. A subset is
% a 1 x h vector like [0 0 1 0].
subsetNegLogLikeValue = @(subset)likelihoodCostOfResponseSubset(subset,curFeature,model);

% Get a lower bound on the model cost for each k > 0. If this lower
% bound isn't below lowestNonfeatureCostEver, don't even bother looking.
lowerBoundsEachK = subsetNegLogLikeValue(ones(1,model.h)) + ...
    model.codingCost + ...
    model.modelCostOfK;
maxKToCheck = find(lowerBoundsEachK < lowestNonfeatureCostEver, 1, 'last');

% Greedily grow the response subset one response at a time, recording the
% best total cost seen at each subset size k <= maxKToCheck.
candidateNewBetaHat = model.betaHat;
nonFeatureCostIfAdded = Inf;
bestNegLogLike = Inf;
curSubset = zeros(1,model.h);
for k = 1:maxKToCheck
    % Find the single response whose addition most reduces the
    % likelihood cost.
    bestResponseToAdd = NONE_CONST;
    negLogLikeThisK = Inf;
    for iResponse = find(curSubset==0)
        [curNegLogLike curBetaHat] = likelihoodCostOfResponseSubset(...
            curSubset + unitVector(iResponse,model.h),curFeature,model);
        if(curNegLogLike < negLogLikeThisK)
            negLogLikeThisK = curNegLogLike;
            bestResponseToAdd = iResponse;
            betaHatThisK = curBetaHat;
        end
    end
    assert(bestResponseToAdd~=NONE_CONST);
    curSubset(bestResponseToAdd) = 1;

    % Total cost with this subset of k responses.
    costThisK = negLogLikeThisK + model.codingCost + model.modelCostOfK(k);
    if(costThisK < nonFeatureCostIfAdded)
        nonFeatureCostIfAdded = costThisK;
        bestNegLogLike = negLogLikeThisK;
        candidateNewBetaHat = betaHatThisK;
    elseif(stopEarly)
        break;
    end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function out = unitVector(i,h)
% unitVector
% Input:
%   i: Index of the single nonzero entry.
%   h: Length of the vector.
% Output:
%   out: A 1 x h vector of zeros with a 1 at position i.
assert(i<=h,'Index i cannot be > h.');
out = zeros(1,h);
out(i) = 1;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function print_diagnostics(model,bestFeatureToAdd,cur_k,prevLikelihoodCost,prevCodingCost,varargin)
% Print important info about the current state of the model in mic.m.

% Parse input.
parser = inputParser;
parser.FunctionName = 'print_diagnostics';
parser.addRequired('model');
parser.addRequired('bestFeatureToAdd');
parser.addRequired('cur_k');
parser.addRequired('prevLikelihoodCost');
parser.addRequired('prevCodingCost');
parser.parse(model,bestFeatureToAdd,cur_k,prevLikelihoodCost,prevCodingCost,varargin{:});
model = parser.Results.model;
bestFeatureToAdd = parser.Results.bestFeatureToAdd;
cur_k = parser.Results.cur_k;
prevLikelihoodCost = parser.Results.prevLikelihoodCost;
prevCodingCost = parser.Results.prevCodingCost;

% Print.
fprintf('#feat=%3i, adding feat. %4i with %2i resp., like.=%4.0f, cod.=%3.0f, chg.like.=%5.0f, chg.code=%5.0f, avg.sig.=%5.4f\n',...
    model.nFeatures,...
    bestFeatureToAdd,...
    cur_k,...
    model.likelihoodCost,...
    model.codingCost,...
    model.likelihoodCost-prevLikelihoodCost,...
    model.codingCost-prevCodingCost,...
    mean(diag(model.curNullSigmaHat)));

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% nearPD.m
% This function is a Matlab rip-off of an R function called 'nearPD' in
% the Matrix package:
% http://rweb.stat.umn.edu/R/library/Matrix/html/nearPD.html
% The complete text of the original R function appears below in a
% comment.
function X = nearPD(X, varargin)

% Parse input.
parser = inputParser;
parser.FunctionName = 'nearPD';
parser.addRequired('X', @isnumeric);
parser.addParamValue('maxit', 5000, @(x)x>=1);
parser.addParamValue('isCorrMatrix', 1, @(x)or(x==0,x==1));
parser.parse(X, varargin{:});
X = parser.Results.X;
maxit = parser.Results.maxit;
isCorrMatrix = parser.Results.isCorrMatrix;

% Get sizes.
[n n2] = size(X);
assert(n==n2,'X is not square.');

% Check symmetry.
SMALL_NUM = .0001;
assert(all(all(X-X' < SMALL_NUM)),'X is not symmetric.');

% Define constants matching the R function's defaults.
EIG_TOL = 1e-6;
CONV_TOL = 1e-7;
POSD_TOL = 1e-8;
DO2EIGEN = 1;

% Initialize.
U = zeros(n,n);
iter = 0;
converged = false;
conv = Inf;
while(iter < maxit && ~converged)
    Y = X;
    T = Y - U;

    % Project onto PSD matrices.
    [Q,D] = eig(Y);
    % Matlab reverses the eigenvalues from the way R does things.
    Q = fliplr(Q);
    d = fliplr(diag(D)');
    D = diag(d);

    % Create a mask from the relative positive eigenvalues.
    p = d > EIG_TOL * d(1); % Use p mask.
    % NOTE: If this code runs slowly, see if there's a way to speed it
    % up like in the R function....
    Q = Q(:,p);
    X = Q * D(p,p) * Q';

    % Update Dykstra's correction.
    U = X - T;

    % Project onto symmetric and possibly 'given diag' matrices.
    X = (X + X')/2;
    if(isCorrMatrix)
        X(eye(n)==1) = 1;
    end

    conv = norm(Y-X,inf) / norm(Y, inf);
    iter = iter + 1;
    converged = (conv <= CONV_TOL);
end
if(~converged)
    warning('nearPD did not converge in %i iterations.',iter);
end

if(DO2EIGEN)
    [Q,D] = eig(X);
    % Matlab reverses the eigenvalues from the way R does things.
    Q = fliplr(Q);
    d = fliplr(diag(D)');
    Eps = POSD_TOL * abs(d(1)); % The R code says to take d(1), but Matlab reverses the order of eigenvalues.
    if(d(n) < Eps)
        d(d < Eps) = Eps;
        oDiag = diag(X)';
        X = Q * (repmat(d',1,n) .* Q');
        D = sqrt(max(Eps, oDiag) ./ diag(X)');
        X = repmat(D',1,n) .* X .* repmat(D,n,1);
    end
end
if(isCorrMatrix)
    X(eye(n)==1) = 1;
end
assert(all(eig(X)>0),'The X returned by nearPD is not positive definite!');
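% Hedged usage sketch: a symmetric matrix with an inconsistent entry,
%   A = [1 .9 -.9; .9 1 .3; -.9 .3 1];   % det(A) < 0, so not PD
%   P = nearPD(A,'isCorrMatrix',1);      % all(eig(P) > 0)
% is projected toward the nearest correlation matrix by alternating the
% PSD and unit-diagonal projections above, with Dykstra's correction.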
%{
## nearcor.R :
## Copyright (2007) Jens Oehlschlägel
## GPL licence, no warranty, use at your own risk
nearPD <-
    ## Computes the nearest correlation matrix to an approximate
    ## correlation matrix, i.e. not positive semidefinite.
    function(x  # n-by-n approx covariance/correlation matrix
             , corr = FALSE, keepDiag = FALSE
             , do2eigen = TRUE  # if TRUE do a sfsmisc::posdefify() eigen step
             , only.values = FALSE # if TRUE simply return lambda[j].
             , eig.tol  = 1e-6 # defines relative positiveness of eigenvalues compared to largest
             , conv.tol = 1e-7 # convergence tolerance for algorithm
             , posd.tol = 1e-8 # tolerance for enforcing positive definiteness
             , maxit    = 100 # maximum number of iterations allowed
             , trace = FALSE # set to TRUE (or 1 ..) to trace iterations
             )
{
    stopifnot(isSymmetric(x))
    n <- ncol(x)
    if(keepDiag) diagX0 <- diag(x)
    ## U should be like x, but filled with '0' -- following also works for 'Matrix':
    U <- x; U[] <- 0
    X <- x
    iter <- 0 ; converged <- FALSE; conv <- Inf
    while (iter < maxit && !converged) {
        Y <- X
        T <- Y - U

        ## project onto PSD matrices
        e <- eigen(Y, symmetric = TRUE)
        Q <- e$vectors
        d <- e$values
        ## D <- diag(d)

        ## create mask from relative positive eigenvalues
        p <- d > eig.tol*d[1]

        ## use p mask to only compute 'positive' part
        Q <- Q[,p,drop = FALSE]
        ## X <- Q %*% D[p,p,drop = FALSE] %*% t(Q) --- more efficiently :
        X <- tcrossprod(Q * rep(d[p], each=nrow(Q)), Q)

        ## update Dykstra's correction
        U <- X - T

        ## project onto symmetric and possibly 'given diag' matrices:
        X <- (X + t(X))/2
        if(corr) diag(X) <- 1
        else if(keepDiag) diag(X) <- diagX0

        conv <- norm(Y-X, "I") / norm(Y, "I")
        iter <- iter + 1
        if (trace)
            cat(sprintf("iter %3d : #{p}=%d, ||Y-X|| / ||Y||= %11g\n",
                        iter, sum(p), conv))
        converged <- (conv <= conv.tol)
    }
    if(!converged) {
        warning("nearPD() did not converge in ", iter, " iterations")
    }
    ## force symmetry is *NEVER* needed, we have symmetric X here!
    ## X <- (X + t(X))/2
    if(do2eigen || only.values) {
        ## begin from posdefify(sfsmisc)
        e <- eigen(X, symmetric = TRUE)
        d <- e$values
        Eps <- posd.tol * abs(d[1])
        if (d[n] < Eps) {
            d[d < Eps] <- Eps
            if(!only.values) {
                Q <- e$vectors
                o.diag <- diag(X)
                X <- Q %*% (d * t(Q))
                D <- sqrt(pmax(Eps, o.diag)/diag(X))
                X[] <- D * X * rep(D, each = n)
            }
        }
        if(only.values) return(d)
        ## unneeded(?!): X <- (X + t(X))/2
        if(corr) diag(X) <- 1
        else if(keepDiag) diag(X) <- diagX0
    } ## end from posdefify(sfsmisc)
    structure(list(mat = new("dpoMatrix", x = as.vector(X),
                             Dim = c(n,n), Dimnames = .M.DN(x)),
                   eigenvalues = d,
                   corr = corr, normF = norm(x-X, "F"),
                   iterations = iter, rel.tol = conv, converged = converged),
              class = "nearPD")
}
%}