function f = run_parse_domains(froot,DPparameters,input_dir,output_dir) % % This function calls the function parse_domains for different values of the threshold T used % to 'binarize' the N-matrix. It carries out a search of the T value that optimizes the score. % % froot: A character string containing the concatenation of the pdb code and the % chain ID, e.g., 5p21A, 2cbiA, 1wgtA, 1skyE, etc. % Note: froot must corresponds to the input files (see below). % DPparameters: The structure containing the parameters required by the program % If the structure does not exist yet it is read on file % specified by DPparameters % input_dir: The name of the directory containing the required input files % (froot.mathlab, froot_pdbnum.txt and froot_CAdist.txt files) % output_dir: The name of the directory containing the results % if(ischar(DPparameters)) DPparameters = read_parameters(DPparameters); end DPparameters.pdbcode = froot(1:4); DPparameters.chainId = froot(5:5); DPparameters.output_dir = [output_dir,'/']; if(exist(output_dir,'dir') == 0) mkdir(output_dir); end % % Read on files the required data, vast_res is a structure that contains the results % of VAST for the current protein, CAdist is the matrix of CA distances, % pdbnum is the PDB numbering for the protein and NN is the N-matrix, . % [CAdist, pdbnum, vast_res] = read_protein_data(input_dir,froot,DPparameters.PadLength,DPparameters.Rmsd_threshold); NN = vast_res.matali' * vast_res.matali; if(DPparameters.dbg >= 0) % figure(105); h=figure('Visible', 'off'); imagesc(NN); print('-dpng',[output_dir,'/',froot,'_Nmatrix']); % close(105); close(h); h=figure('Visible', 'off'); contour(NN); print('-dpng',[output_dir,'/',froot,'_Nmatrix_contour']); % close(105); close(h); end % % Search for the value of T that provides the smallest score. % if(strcmp(DPparameters.output_file,'void')) fid = -1; else fid = fopen([output_dir,'/',froot,'.',DPparameters.output_file],'w'); end DPparameters.fid = fid; if(fid ~= -1) fprintf(fid,'Exploring different threshold values, T, for protein: %s\n\n',froot); end Tmp = [0:50:500,600:100:1500,1750:250:max(max(NN))]; for k = 1:length(Tmp) if(Tmp(k) > DPparameters.Tstart) indx = k-1; break; end end Tmp(indx) = DPparameters.Tstart; Tvec = Tmp(indx:length(Tmp)); % % Determine the smallest score value on a unidimensional grid % Npts = length(Tvec); s = cell(Npts,1); for k = 1:length(Tvec) DPparameters.T = Tvec(k); s{k} = parse_domains(NN,CAdist,pdbnum,DPparameters); if(isempty(s{k})) Npts = k - 1; break; end if(fid ~= -1) fprintf(fid,'\n Threshold value T= %d\n',Tvec(k)); prt_dom(fid,s{k}); fprintf(fid,' Consistency= %.3f score= %.3f tfunc=%.3f\n',s{k}.consistency,s{k}.score,s{k}.score*DPparameters.wgt1+s{k}.consistency*DPparameters.wgt2); end end % % Analyze the results % if(Npts == 0) disp(['WARNING: Npts == 0 meaning that nothing was found even for T= ',num2str(Tvec(1))]); if(fid ~= -1) fprintf(fid,'WARNING: Npts == 0 meaning that nothing was found even for T= %d',Tvec(1)); end prt.score = -1; prt.stretches = [1,size(CAdist,1)]; prt.domains = 1:size(CAdist,1); sopt = struct('T',[],'score',-1,'consistency',-1,'dom_history',[],'final_domains',prt,'ori_domains',[],'tfunc',-1); f = struct('protein',froot,'s',sopt,'parameters',DPparameters); print_NDO(sopt.final_domains,pdbnum,[output_dir,'/T',froot,'DP.txt']); return; end s = s(1:Npts); minQ = 1000000000.; for k = 1:Npts if(s{k}.score < minQ) minQ = s{k}.score; indx = k; end end score = zeros(Npts,1); consistency = zeros(Npts,1); ndom = zeros(Npts,1); tfunc = zeros(Npts,1); for k = 1:Npts score(k) = s{k}.score; consistency(k) = s{k}.consistency; tfunc(k) = score(k) * DPparameters.wgt1 + consistency(k) * DPparameters.wgt2; ndom(k) = length(s{k}.final_domains); end % % Plot the curve for the score % if(DPparameters.dbg >= 1) % figure(100) h = figure('Visible', 'off'); Cdom = cell(Npts,1); thresh = zeros(1,Npts); for k = 1:Npts Cdom{k} = num2str(length(s{k}.final_domains)); thresh(k) = s{k}.T; end plot(thresh,tfunc); deltay = max(tfunc)*0.02; deltax = (thresh(Npts)-thresh(1))*0.02; for k = 1:Npts text(thresh(k)+deltax,tfunc(k)+deltay,Cdom{k}); end print('-dpdf',[output_dir,'/',froot,'_tfunc']); % close(100); close(h); end % % Select the optimal partitioning of the protein % if(fid ~= -1) fprintf(fid,'\n'); end partition = sort(unique(ndom)); pos = 1:length(ndom); min_tfunc = zeros(length(partition),1); min_pos = zeros(length(partition),1); for k = 1:length(partition) tf = ndom == partition(k); pos1 = pos(tf); [min_tfunc(k), ii] = min(tfunc(tf)); min_pos(k) = pos1(ii); if(fid ~= -1) fprintf(fid,'Exploration generated %d times: %d domain(s) with minimum tfunc= %.3f\n',sum(tf),partition(k),min_tfunc(k)); end end if(partition(1) == 1) % % Consistency is always equal to zero when there is 1 domain % [mc,ii] = min(min_tfunc(2:length(min_tfunc))); if(mc < DPparameters.tfunc_threshold) opt = min_pos(ii+1); else opt = min_pos(1); end else [mc,ii] = min(min_tfunc(1:length(min_tfunc))); opt = min_pos(ii); end sopt = s{opt}; if(fid ~= -1) fprintf(fid,'\nOptimal partition selected has %d domains\n',length(s{opt}.final_domains)); end prt_dom(fid,sopt); tfunc = sopt.score*DPparameters.wgt1+sopt.consistency*DPparameters.wgt2; if(fid ~= -1) fprintf(fid,'Consistency= %.3f score= %.3f tfunc= %.3f\n',sopt.consistency,sopt.score,tfunc); end sopt.tfunc = tfunc; % % Print chimera script and a file used for NDO computation % print_chimera(s{opt}.final_domains,pdbnum,DPparameters.pdbcode,DPparameters.chainId,[output_dir,'/',froot,'.cmd']); print_NDO(s{opt}.final_domains,pdbnum,[output_dir,'/T',froot,'DP.txt']); % % Save the results on a file % f = struct('protein',froot,'s',sopt,'parameters',DPparameters); save([output_dir,'/Dom',froot],'-struct','f'); %-------------------------------------------------------------------------- % %-------------------------------------------------------------------------- function prt_dom(fid,s) for j = 1:length(s.final_domains) dm = []; for l = 1:size(s.final_domains(j).stretches,1) dm = [dm,' [',num2str(s.final_domains(j).stretches(l,1)),'-',num2str(s.final_domains(j).stretches(l,2)),']']; end fprintf(fid,' Dom %d : %s\n',j,dm); end %-------------------------------------------------------------------------- % %-------------------------------------------------------------------------- function print_NDO(prot,pdbnum2,file) % % Create a file for computing NDO score % prot: structure containing the domain descriptions % pdbnum: the PDB numbering scheme % file: name of the output file % nn = length(prot); % % pdbnum2(1,:) contains the pdb residue numbers % pdbnum2(2,:) contains the insertion code % Note: -1? means that the corresponding residue is missing % pdbnum = pdbnum2(1,:); incode = pdbnum2(2,:); % fid = fopen(file,'w'); fprintf(fid, 'P.SVD\t'); for k = 1:nn r = prot(k).stretches; [nrow,ncol] = size(r); fprintf(fid,'D%d:',k); for l = 1:nrow fprintf(fid,'%d-%d ',pdbnum(r(l,:))); end end fclose(fid); %-------------------------------------------------------------------------- % %-------------------------------------------------------------------------- function print_chimera(prot,pdbnum2,pdbcode,chainId,file) % % Create a chimera script that will be used in chimera to color % residues belonging to the domains. % prot: structure containing the domain descriptions % pdbnum: the PDB numbering scheme % pdbcode: the pdb code for the protein % chainId: the name of the protein chain % file: is the name of the rasmol script file % nn = length(prot); % % pdbnum2(1,:) contains the pdb residue numbers % pdbnum2(2,:) contains the insertion code % Note: -1? means that the corresponding residue is missing % pdbnum = pdbnum2(1,:); incode = pdbnum2(2,:); % % colors: is an array of strings representing colors used in CHIMERA. % The colors correspond to name, , e.g., 'red', 'blue', 'green', or % If there are more domains than elements in array colors % the colors are "recycled" (only when the number of domains is % greater than 10). % if(nn <= 4) colors = char('red','yellow','cornflower blue','forest green'); elseif (nn <= 6) colors = char('red','orange','yellow','cyan','green','cornflower blue'); elseif(nn <= 8) colors = char('red','orange','yellow','green','forest green','cyan','blue','purple'); else colors = char('red','orange','yellow','green','forest green','cyan','cornflower blue','purple','hot pink','magenta'); end [ncolors,tmp] = size(colors); % fid = fopen(file,'w'); fprintf(fid,'set bg_color white\nopen %s\ndelete ~ :.%s\n',pdbcode,chainId); fprintf(fid,'ribbon\nribrep rounded\n~disp\n'); for k = 1:nn r = prot(k).stretches; [nrow,ncol] = size(r); fprintf(fid,'alias dom%d :',k); for l = 1:(nrow-1) fprintf(fid,'%d-%d,',pdbnum(r(l,:))); end fprintf(fid,'%d-%d\n',pdbnum(r(nrow,:))); ic = mod(k,ncolors); if(ic == 0) ic = ncolors; end fprintf(fid,'color %s dom%d\n',colors(ic,:),k); end fclose(fid);