function [p lab] = ...
    KEGG_annot(filename,Gene_Symbol,gs)
% KEGG_ANNOT  KEGG pathway annotation and enrichment scores for gene sets.
%
%   KEGG_annot(filename)
%       Parses ./KEGG_files/<filename>.keg (downloading it first when it
%       cannot be opened) and caches the gene-by-term membership matrix in
%       ./KEGG_files/KEGGdata.<filename>.mat. filename defaults to 'hsa'.
%
%   KEGG_annot(filename,Gene_Symbol)
%       Additionally maps the cell array Gene_Symbol (compared in upper
%       case) onto the parsed gene list and caches the result in
%       ./KEGG_files/KEGGAnn<filename>.mat.
%
%   [p,lab] = KEGG_annot(filename,Gene_Symbol,gs)
%       Additionally computes enrichment scores. gs has one row per entry
%       of Gene_Symbol and one column per gene set; non-zero entries mark
%       membership. p is a sparse (gene sets x terms) matrix holding
%       -log10(P) minus log10(number of non-empty terms), floored at 0.
%       lab holds the term labels, indexed like the columns of p.
%
%   The outputs p and lab are only assigned by the three-argument form.
%
%   NOTE(review): both cache files are keyed by filename only. The
%   annotation cache is reused even when Gene_Symbol differs from the list
%   it was built with - delete ./KEGG_files/KEGGAnn<filename>.mat when the
%   gene list changes. Caches written by earlier versions of this function
%   should also be deleted, since the term propagation below was corrected.
%
%   NOTE(review): the download URL directory and the gene-link prefix are
%   hard-coded to 'hsa'; other organisms presumably need adapting.

if nargin==0
    filename = 'hsa';
end

h = dir(['./KEGG_files/KEGGdata.',filename,'.mat']);

%% Parse the KEGG hierarchy file (skipped when the cache file exists)
if isempty(h)
    fid = fopen(['./KEGG_files/',filename,'.keg'],'r');
    if fid<=0
        disp('KEGG file not found, start downloading...')
        urlwrite(['ftp://ftp.genome.jp/pub/kegg/brite/organisms/hsa/',filename,'00001.keg'],['./KEGG_files/',filename,'.keg']);
        fid = fopen(['./KEGG_files/',filename,'.keg'],'r');
        if fid<=0
            % Fail with a clear message instead of an obscure fread error.
            error('KEGG_annot:fileNotFound', ...
                'Could not open ./KEGG_files/%s.keg after download.',filename);
        end
    end

    disp('Extract data from file')

    % Read the whole file as one row vector of characters.
    C = fread(fid,'*char');
    fclose(fid);
    C = C';

    % Character positions of the markers used to walk the hierarchy.
    k0 = strfind(C,'A<B>');         % level-A category lines
    k0b = strfind(C,'</B>');        % end of a bold category name
    k1 = strfind(C,'B  <B>');       % level-B category lines
    k2 = strfind(C,'C    ');        % level-C (pathway) lines
    k2b = strfind(C,'[PATH:');      % pathway identifier tags
    str = 'D      <a href="/dbget-bin/www_bget?hsa+';
    k3 = strfind(C,str)+length(str);% first character after the gene link prefix
    k3b = strfind(C,'</a> ');
    k3c = strfind(C,';');

    % Pathway numbers and names. The fixed offsets presumably skip the
    % three-letter organism code inside '[PATH:xxxNNNNN]' - TODO confirm.
    ptw_num = zeros(length(k2b),1);
    ptw_name = cell(length(k2b),1);
    for i=1:length(ptw_num)
        ptw_num(i) = str2double(C(k2b(i)+9:k2b(i)+13));
        ep1 = k2(find(k2<k2b(i),1,'last'))+11;
        ep2 = k2b(i)-2;
        ptw_name{i} = C(ep1:ep2);
    end

    % Level-A category numbers and names.
    catA_num = zeros(length(k0),1);
    catA_name = cell(length(k0),1);
    for i=1:length(k0)
        catA_num(i) = str2double(C(k0(i)+4:k0(i)+8));
        ep = k0b(find(k0b>k0(i),1,'first'));
        catA_name{i} = C(k0(i)+10:ep-1);
    end

    % Level-B category numbers and names.
    catB_num = zeros(length(k1),1);
    catB_name = cell(length(k1),1);
    for i=1:length(k1)
        catB_num(i) = str2double(C(k1(i)+6:k1(i)+10));
        ep = k0b(find(k0b>k1(i),1,'first'));
        catB_name{i} = C(k1(i)+12:ep-1);
    end

    % KEGGTree(r,c) = 1 means: genes of term r also belong to term c.
    % Rows and columns are indexed by KEGG term number, exactly like the
    % columns of PTW and the entries of lab. speye avoids building a dense
    % identity first.
    KEGGTree = speye(max(ptw_num));
    for i=1:length(ptw_num)
        sA = catA_num(find(k0<k2b(i),1,'last'));
        sB = catB_num(find(k1<k2b(i),1,'last'));
        % The row must be the pathway NUMBER, not the loop counter,
        % otherwise genes are propagated from the wrong pathway.
        KEGGTree(ptw_num(i),[sA sB]) = 1;
    end

    % Labels indexed by term number; unused numbers get an empty string.
    lab = cell(max(ptw_num),1);
    lab(ptw_num) = ptw_name;
    lab(catA_num) = catA_name;
    lab(catB_num) = catB_name;
    for i=1:length(lab)
        if isempty(lab{i})
            lab{i} = '';
        end
    end

    % One row per gene name found on a gene line, one column per pathway
    % number. Rows are pre-allocated and trimmed to ri afterwards.
    PTW = sparse(1e5,max(ptw_num));
    gene_list = cell(1e5,1);
    ri = 0;
    for i=1:length(k3)
        sp = k3b(find(k3b>k3(i),1,'first'))+5;
        ep = k3c(find(k3c>sp,1,'first'))-1;
        st = C(sp:ep);                          % text between '</a> ' and ';'
        id_y = ptw_num(find(k2b<sp,1,'last'));  % pathway this gene line is under

        % Split the comma-separated names; +2 skips ', ' between names.
        c_pos = strfind(st,',');
        c_pos = [-1 c_pos length(st)+1];
        for j=1:length(c_pos)-1
            gene_list{ri+j} = st(c_pos(j)+2:c_pos(j+1)-1);
            PTW(ri+j,id_y) = 1;
        end
        ri = ri+length(c_pos)-1;
    end
    PTW = PTW(1:ri,:);
    gene_list = gene_list(1:ri);

    % Merge rows that refer to the same gene name.
    UG = unique(gene_list);
    k = match_names(UG,gene_list);
    PTW = double(k*PTW~=0);
    gene_list = UG;

    % Keep only names starting with a capital letter A-Z.
    sel = zeros(length(gene_list),1);
    for i=1:length(gene_list)
        if isempty(gene_list{i})
            continue    % empty token: no first character to test, drop it
        end
        tmp = double(gene_list{i}(1));
        if tmp>=65 && tmp<=90
            sel(i) = 1;
        end
    end
    gene_list = gene_list(sel==1);
    PTW = PTW(sel==1,:);

    % Propagate pathway membership to the parent categories. Binarise the
    % product so a gene found in several pathways of one category counts
    % once, as the enrichment test below expects 0/1 membership.
    PTW = double(PTW*KEGGTree~=0);

    save(['./KEGG_files/KEGGdata.',filename,'.mat'],'PTW','gene_list','lab');

end

%% Build KEGG annotation

h = dir(['./KEGG_files/KEGGAnn',filename,'.mat']);
if nargin>1 && isempty(h)

    % Loads PTW, gene_list and lab into the function workspace.
    load(['./KEGG_files/KEGGdata.',filename,'.mat']);
    disp('Build KEGG annotation')

    GS = upper(Gene_Symbol);
    lks = match_names(GS,gene_list);    % (input genes x KEGG genes) links
    KEGG = lks*PTW;                     % (input genes x terms) membership
    ptw_name = lab;

    save(['./KEGG_files/KEGGAnn',filename,'.mat'],'KEGG','Gene_Symbol','ptw_name');
end

%% Compute P-values


if nargin>2
    % Loads KEGG, Gene_Symbol and ptw_name (overwriting the input
    % Gene_Symbol with the cached one).
    load(['./KEGG_files/KEGGAnn',filename,'.mat']);
    % remove unannotated or multiple probes

    u_list = find(sum(KEGG,2)>0);
    KEGG = KEGG(u_list,:);
    Gene_Symbol = Gene_Symbol(u_list);
    gs = gs(u_list,:);

    % Collapse repeated gene symbols: one KEGG row per symbol, gs rows of
    % the repeated symbols are summed.
    [UG u_list] = unique(Gene_Symbol);
    KEGG = KEGG(u_list,:);
    kk = match_names(UG,Gene_Symbol);
    gs = kk*gs;

    disp('Compute P-values for KEGG pathways')

    Genome = [1:size(gs,1)]';
    % '+' binds tighter than ':', so this is gammaln(1:(M+1)), i.e.
    % lnGFacts(k+1) = ln(k!) for k = 0..M.
    lnGFacts=gammaln(1:length(Genome)+1);
    Ns = sum(gs~=0,1);                  % genes per gene set
    Ks = sum(KEGG,1);                   % genes per term
    Isct = (gs'~=0)*KEGG;               % overlap counts (gene sets x terms)
    tot_terms = length(find(sum(KEGG,1)~=0));
    % Evaluate each distinct overlap column once; n maps back to all terms.
    [I m n] = unique(Isct','rows');
    Ks = Ks(m);
    Isct = I';
    P = zeros(size(gs,2),size(Isct,2));
    h = waitbar(0);
    set(h,'Name','KEGG');
    for i=1:size(gs,2)
        waitbar(i/size(gs,2),h,[num2str(i),' / ',num2str(size(gs,2))]);
        sel = find(Isct(i,:)>0);
        for j=1:length(sel)
            P(i,sel(j)) = calcEnrichmentPVal(Ns(i),Ks(sel(j)),Isct(i,sel(j)),length(Genome),lnGFacts);
        end
    end
    close(h);
    P = P(:,n);
    p = P;
    % Subtracting log10(tot_terms) from -log10(P) equals multiplying the
    % p-value by the number of non-empty terms; scores are floored at 0.
    p(P~=0) = max(P(P~=0)-log10(tot_terms),0);
    p = sparse(p);
    lab = ptw_name;
end

%% matching cells (by Sven)
function [name_links common_names unique_names1 unique_names2] = match_names(names1, names2)
% MATCH_NAMES  Link equal entries of two name lists.
%
%   name_links     sparse numel(names1)-by-numel(names2) matrix with a 1 at
%                  (a,b) wherever names1(a) and names2(b) are the same name
%   common_names   names present in both lists (as returned by intersect)
%   unique_names1  sorted unique entries of names1
%   unique_names2  sorted unique entries of names2

% Reduce both lists to their unique entries; pos1/pos2 map every original
% entry to its position in the corresponding unique list.
[unique_names1 unused1 pos1] = unique(names1);
[unique_names2 unused2 pos2] = unique(names2);

% Positions of the shared names inside each unique list.
[common_names, hit1, hit2] = intersect(unique_names1,unique_names2);

% Link matrix between the unique lists, built in one call from the
% (row, column, value) triplets of the shared names.
n_hits = length(common_names);
unique_name_links = sparse(hit1(:), hit2(:), ones(n_hits,1), ...
    length(unique_names1), length(unique_names2));

% Expand back to the original (possibly repeated) entries.
name_links = unique_name_links(pos1,pos2);

%% Enrichment P-values (by Ron Chen)
% calculate the P value for obtaining enrichment between gene clusters and
% GO terms / transcriptional modules. Implements expression taken from
% 'systematic determination ... network architecture' by Tavazoie & Church

function P=calcEnrichmentPVal(N,K,x,M,lnGFacts)
% P=calcEnrichmentPVal(N,K,x,M,lnGFacts) calculate the P value for
% over-representation of one set within another according to the
% hypergeometric probability density function:
%
%                   (K)(M-K)
%                   (x)(N-x)            K!(M-K)!N!(M-N)!
% y = f(x|M,K,N) = ----------  =   ----------------------------
%                     (M)           x!(K-x)!(M-K+x-N)!(N-x)!M!
%                     (N)
%
% The result, y, is the probability of drawing exactly x of a possible K
% items in N drawings without replacement from a group of M objects. The
% probability computed below is to draw x OR MORE items.
%
% =========== INPUT ============
% N: scalar, # of genes in the cluster (module or gene list)
% K: scalar, # of genes in the GO category, from the defined genome OR
%   number of genes in the transcriptional module onto which the projection
%   is done
% x: scalar, # of genes in the intersection between the cluster and
%   the category
% M: scalar, # of genes in the defined genome
% lnGFacts: a vector whose element k+1 is the natural logarithm of k!,
%   for k = 0..M (e.g. gammaln(1:M+1)). This is to avoid repeated
%   recalculation of the factorials.
%
% =========== OUTPUT ===========
% P: -log10 of the probability of drawing x or more items. Inf when
%   x > min(K,N), i.e. when the tail contains no term.
%
% written by Ron Chen 5/10/03


%%

Xs = x : min(K,N);

if isempty(Xs)
    % Empty tail: the probability is 0, so -log10 is Inf.
    P = Inf;
    return
end

% Ones are added to every index because element k+1 of lnGFacts is the
% ln factorial of k (its preceding index).
Kx = lnGFacts(K+1) - (lnGFacts(K-Xs+1)+lnGFacts(Xs+1));               % ln C(K,Xs)
MKNx = lnGFacts(M-K+1) - (lnGFacts(M-K-N+Xs+1)+lnGFacts(N-Xs+1));     % ln C(M-K,N-Xs)
MN = lnGFacts(M+1) - (lnGFacts(M-N+1)+lnGFacts(N+1));                 % ln C(M,N)

% Sum the tail probabilities in log space (log-sum-exp). Factoring out the
% largest term keeps exp() from underflowing to 0 for very strong
% enrichments, which would otherwise give P = Inf.
lnTerms = Kx+MKNx-MN;
lnMax = max(lnTerms);
lnPVal = lnMax + log(sum( exp(lnTerms-lnMax) ));

P = -lnPVal/log(10);

