Machine Learning with MATLAB

From XenopusBioinfo
Jump to: navigation, search
%Load both input files up front.
%Defs.txt is the definitions file: accession numbers, gene symbols and other information.
Defs = importdata('Defs.txt');
%A.tab is the data file: a header row, then accession numbers and data points for the different time periods.
A = importdata('A.tab');

%Accession numbers of the DATA file: first text column of A, taken from row 2 onwards (2:end) because row 1 is the header.
A.IDs = A.textdata(2:end, 1);

%Accession numbers of the definitions file: first text column, all rows.
Defs.IDs = Defs.textdata(:, 1);

%Gene symbols of the definitions file: third text column, all rows.
Defs.sym = Defs.textdata(:, 3);
 
%Noise threshold: a gene is kept only if the sum of all its data points is strictly greater than this value
noise = 5;

%For every gene (row of A.data), sum its data points across all columns (sum(...,2) sums along each row) and compare to the threshold.
%A.overNoise is a logical column vector with one entry per gene: true when that gene's total is above noise.
A.overNoise = (sum(A.data, 2) > noise);

%Keep only the rows of A.data that belong to genes above the threshold; store them in A.raw_data
A.raw_data = A.data(A.overNoise, :);

%Filter the IDs with the same mask so that A.IDs stays aligned row-for-row with A.raw_data.
%This overwrites the previous A.IDs, which from here on only contains the IDs of the genes above noise.
A.IDs = A.IDs(A.overNoise);

%Normalize every gene by its own mean across the time points.
%mean(A.raw_data, 2) averages along each row (dimension 2), giving one mean per gene as a column vector.
%repmat copies that column once per time point so it can be divided element-wise into A.raw_data.
%The number of copies is read from the data (size(A.raw_data, 2)) rather than hard-coded to 18,
%so the script also works on data files with a different number of time points.
A.norm_data = A.raw_data ./ repmat(mean(A.raw_data, 2), 1, size(A.raw_data, 2));
 
%Number of clusters requested from kmeans
cln = 24;

%Work on the normalized data under the shorter name "data"
data = A.norm_data;

%Run kmeans on the normalized data with cln clusters, cosine distance ('dist', 'cosi') and at most 100000 iterations ('maxiter', 100000).
%ind holds the cluster index assigned to each gene (row of data); c holds the cluster centroids.
[ind, c] = kmeans(data, cln, 'dist', 'cosi', 'maxiter', 100000);
 
%Open a new figure window that will hold one subplot per cluster
figure;

%Line width used for every plotted profile
lw = 1;

%Derive the subplot grid from the number of clusters instead of hard-coding 4x6,
%so that changing cln does not make subplot fail with an out-of-range index.
%For cln = 24 this still yields 4 rows by 6 columns.
nRows = max(1, floor(sqrt(cln)));
nCols = ceil(cln / nRows);

%For each of the clusters (k = 1:cln). k is used instead of i so that MATLAB's imaginary unit i is not shadowed.
for k = 1:cln

    %Select the subplot for the current cluster within the nRows x nCols grid
    subplot(nRows, nCols, k);

    %Logical mask of the genes assigned to cluster k. This is only a selector, not the data itself.
    whichG = (ind == k);

    %Rows of the normalized data that belong to the genes in the current cluster
    dd = data(whichG, :);

    %plot draws one line per column, so transpose to get one line per gene across the time points
    plot(dd', 'LineWidth', lw);

%End the for loop
end