function [p_bp, p_cc, p_mf, lab] = ...
    GO_annot(filename1,Gene_Symbol,gs)
% GO_ANNOT  Build Gene Ontology annotation caches and score gene-set enrichment.
%
%   [p_bp, p_cc, p_mf, lab] = GO_annot(filename1, Gene_Symbol, gs)
%
%   The function works in three stages; the first two are cached as .mat
%   files under ./GO_files/ and are skipped when the cache file exists:
%     1. Parse the GOA association file into ./GO_files/GOdata<ext>.mat
%        (downloading the file first if it is not found locally).
%     2. (nargin > 1) Build the direct and ancestor-propagated ("full")
%        gene x term annotation matrices for BP, CC and MF and store them
%        in ./GO_files/GOAnn<ext>.mat.
%     3. (nargin > 2) Compute hypergeometric enrichment scores of every
%        column of gs against every GO term.
%
%   Inputs
%     filename1   name of the GOA file inside ./GO_files/
%                 (default 'gene_association.goa_human'). Its extension
%                 <ext> is used to tag the cache files.
%     Gene_Symbol list of gene symbols, one per row of gs. It is upper-cased
%                 and matched against the GOA symbols and synonyms
%                 (presumably a cell array of strings - confirm with
%                 match_names).
%     gs          matrix with one row per entry of Gene_Symbol and one
%                 column per gene set; non-zero entries mark membership.
%
%   Outputs (left empty unless nargin > 2)
%     p_bp, p_cc, p_mf  sparse (gene sets x GO terms) matrices holding
%                 max(-log10(p) - log10(#non-empty terms), 0), i.e. the
%                 -log10 P-value after a Bonferroni-style correction.
%     lab         GOtree.bp.term (only the BP labels are returned).
%
%   Depends on the external helpers create_GOtree and match_names.
%   NOTE(review): the GOAnn cache is keyed by the GOA file extension only,
%   not by Gene_Symbol; delete it when the gene list changes.

% Outputs stay empty when the call is only used to build the caches.
p_bp = [];
p_cc = [];
p_mf = [];
lab = [];

if nargin<1
    filename1 = 'gene_association.goa_human';
end
filename2 = 'GOtree.mat';

% Cache files are tagged with the extension of the GOA file.
dots = strfind(filename1,'.');
if isempty(dots)
    ending = '';
else
    ending = filename1(dots(end):end);
end

if isempty(dir(['./GO_files/',filename2]))
    create_GOtree;
end

%% Extract data from file

dataFile = ['./GO_files/GOdata',ending,'.mat'];
if isempty(dir(dataFile))
    build_GOdata(filename1,dataFile);
end

%% Build direct and full GO annotation

annFile = ['./GO_files/GOAnn',ending,'.mat'];
annCached = ~isempty(dir(annFile));
if ~annCached && nargin>1

    S = load(dataFile);
    D = S.a(S.c);               % symbol of every (symbol, term, aspect) triple
    GS = upper(Gene_Symbol);

    T = load(['./GO_files/',filename2],'GOtree');
    GOtree = T.GOtree;

    % Aspect codes stored in mati: 1 = P (BP), 2 = C (CC), 3 = F (MF)
    disp('Build direct GO annotation...for BP')
    BP = direct_annotation(GS,D,S.cats,S.mati==1,size(GOtree.bp.parents,1));
    disp('Build direct GO annotation...for CC')
    CC = direct_annotation(GS,D,S.cats,S.mati==2,size(GOtree.cc.parents,1));
    disp('Build direct GO annotation...for MF')
    MF = direct_annotation(GS,D,S.cats,S.mati==3,size(GOtree.mf.parents,1));

    disp('Create full GO annotation')
    BP0 = propagate_to_ancestors(BP,GOtree.bp.parents);
    CC0 = propagate_to_ancestors(CC,GOtree.cc.parents);
    MF0 = propagate_to_ancestors(MF,GOtree.mf.parents);

    GOAnn.direct.bp = double(BP~=0);
    GOAnn.direct.cc = double(CC~=0);
    GOAnn.direct.mf = double(MF~=0);
    GOAnn.full.bp = BP0;
    GOAnn.full.cc = CC0;
    GOAnn.full.mf = MF0;
    save(annFile,'GOAnn');
end
if nargin>2 && annCached
    L = load(annFile);
    BP0 = L.GOAnn.full.bp;
    CC0 = L.GOAnn.full.cc;
    MF0 = L.GOAnn.full.mf;
    T = load(['./GO_files/',filename2],'GOtree');
    GOtree = T.GOtree;
    % The cache does not record which gene list it was built for; at least
    % refuse to continue when the row count cannot match.
    if size(BP0,1) ~= numel(Gene_Symbol)
        error('GO_annot:staleCache', ...
            ['%s holds %d genes but Gene_Symbol has %d entries; ', ...
             'delete the cache file to rebuild it.'], ...
            annFile,size(BP0,1),numel(Gene_Symbol));
    end
end

%% Compute P-values

if nargin>2
    % Drop genes without any GO annotation
    GO0 = BP0+CC0+MF0;
    u_list = find(sum(GO0,2)>0);
    BP0 = BP0(u_list,:);
    CC0 = CC0(u_list,:);
    MF0 = MF0(u_list,:);
    Gene_Symbol = Gene_Symbol(u_list);
    gs = gs(u_list,:);

    % Collapse multiple probes of the same gene: keep one annotation row per
    % unique symbol and combine the matching rows of gs through match_names
    % (presumably an indicator matrix - confirm).
    [UG u_list] = unique(Gene_Symbol);
    BP0 = BP0(u_list,:);
    CC0 = CC0(u_list,:);
    MF0 = MF0(u_list,:);
    kk = match_names(UG,Gene_Symbol);
    gs = kk*gs;

    nGenome = size(gs,1);
    lnGFacts = gammaln(1:nGenome+1);    % lnGFacts(k) = ln((k-1)!)
    gs0 = sparse(gs~=0);
    Ns = sum(gs0,1);                    % size of every gene set

    disp('Compute P-values for BP')
    p_bp = enrichment_scores(gs0,Ns,BP0,nGenome,lnGFacts,'BP');
    lab = GOtree.bp.term;

    disp('Compute P-values for CC')
    p_cc = enrichment_scores(gs0,Ns,CC0,nGenome,lnGFacts,'CC');

    disp('Compute P-values for MF')
    p_mf = enrichment_scores(gs0,Ns,MF0,nGenome,lnGFacts,'MF');
end

function build_GOdata(filename1,dataFile)
% Parse the GOA file ./GO_files/<filename1> (downloading it when missing) and
% save the de-duplicated (symbol, GO id, aspect) triples to dataFile as
%   a    unique upper-cased gene symbols / synonyms
%   c    index into a for every triple
%   cats numeric GO id of every triple
%   mati aspect code of every triple (1 = P, 2 = C, 3 = F, 0 = other)

goaPath = ['./GO_files/',filename1];
baseURL = 'http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/';

fid = fopen(goaPath);
if fid<0
    disp('GOA file not found, start downloading latest revision...')
    % The CVS web page lists links of the form '<file>.gz?rev=...;'; take
    % the first one.
    s0 = urlread([baseURL,filename1,'.gz']);
    cc1 = strfind(s0,[filename1,'.gz?rev']);
    if isempty(cc1)
        error('GO_annot:downloadFailed', ...
            'No revision link for %s.gz found on the download page.',filename1);
    end
    cc1 = cc1(1);
    cc2 = strfind(s0,';');
    cc2 = cc2(find(cc2>cc1,1,'first'))-1;
    tt = s0(cc1:cc2);
    urlwrite([baseURL,tt],[goaPath,'.gz']);
    disp('Download complete, unzipping...')
    gunzip([goaPath,'.gz']);
    fid = fopen(goaPath);
    if fid<0
        error('GO_annot:fileNotFound', ...
            'Could not open %s after download.',goaPath);
    end
end

disp('Extract data from GOA file')

% First pass: read the first character of every line to count the leading
% '!' comment lines.
C = textscan(fid,'%1s %*[^\n]');
fclose(fid);
C = C{1};
idx = 1;
while idx<=numel(C) && strcmp(C{idx},'!')
    idx = idx+1;
end

% Second pass: keep columns 3 (symbol), 5 (GO id), 9 (aspect), 11 (synonyms)
fid = fopen(goaPath);
C = textscan(fid, '%*s %*s %s %*s %s %*s %*s %*s %s %*s %s %*s %*s %*s %*s','delimiter','\t','HeaderLines',idx-1);
fclose(fid);

numLine = size(C{1},1);
if numLine==0
    error('GO_annot:emptyFile','No annotation rows found in %s.',goaPath);
end

% 'GO:0001234' -> 1234: drop the 3-character prefix, combine the 7 digits
tmp = cell2mat(C{2});
cats = double(tmp(:,4:end))-48;
cats = cats*[1e6;1e5;1e4;1e3;1e2;1e1;1];

D = C{1};

mati = zeros(numLine,1);
tmp = cell2mat(C{3});
mati(tmp=='P') = 1;
mati(tmp=='C') = 2;
mati(tmp=='F') = 3;

E = C{4};

% cut out "_HUMAN": truncate every synonym string at its first underscore
tmp_pos = strfind(E,'_');
for i=1:length(tmp_pos)
    if ~isempty(tmp_pos{i})
        E{i} = E{i}(1:tmp_pos{i}(1)-1);
    end
end

% Start from the primary symbols; synonyms (if any) are appended below.
% Initialising here keeps the result defined when the synonym column
% carries no usable information.
names = D;
catsAll = cats;
matiAll = mati;

if length(unique(E))>1
    disp('Adding synonyms...')
    bars = strfind(E,'|');
    nSyn = zeros(length(E),1);          % '|'-separated entries per row
    for i=1:length(E)
        if ~isempty(E{i})
            nSyn(i) = length(bars{i})+1;
        end
    end

    nBase = length(D);
    total = nBase+sum(nSyn);
    names = cell(total,1);
    catsAll = zeros(total,1);
    matiAll = zeros(total,1);
    names(1:nBase) = D;
    catsAll(1:nBase) = cats;
    matiAll(1:nBase) = mati;

    ri = nBase;                         % last filled row
    rows = find(nSyn>0);
    h = waitbar(0,'Processing synonyms');
    for i=1:length(rows)
        if mod(i,1e3)==0
            waitbar(i/length(rows),h);
        end
        r = rows(i);
        k = nSyn(r);
        cuts = [0 bars{r} length(E{r})+1];
        for j=1:k
            names{ri+j} = E{r}(cuts(j)+1:cuts(j+1)-1);
        end
        % every synonym inherits the GO id and aspect of its row
        catsAll(ri+1:ri+k) = cats(r);
        matiAll(ri+1:ri+k) = mati(r);
        ri = ri+k;
    end
    waitbar(1,h);
    close(h);
end

D = upper(names);
cats = catsAll;
mati = matiAll;

% Remove duplicated (symbol, GO id, aspect) triples
[a b c] = unique(D);
[a1 b1 c1] = unique(cats);
[a2 b2 c2] = unique(mati);
c = c(:);
[u1 u2] = unique([c c1(:) c2(:)],'rows');
cats = cats(u2);
mati = mati(u2);
c = c(u2);

save(dataFile,'a','c','cats','mati');

function A = direct_annotation(GS,D,cats,sel,nTerms)
% Gene x term matrix of direct annotations for the triples selected by the
% logical mask sel; GO ids are matched against 1:nTerms through match_names.
m1 = match_names(1:nTerms,cats(sel));
m2 = match_names(GS,D(sel));
A = m2*m1';

function A0 = propagate_to_ancestors(A,parents)
% Extend the gene x term annotation A to all ancestor terms by repeatedly
% applying the parent relation until no longer paths exist.
% NOTE(review): terminates only if the parent relation is acyclic.
M = [parents sparse(size(parents,1),size(parents,1)-size(parents,2))];
M0 = M;
A0 = A;
while sum(M0(:))>0
    A0 = double((A0+A0*M0)~=0);
    M0 = (M0*M)~=0;
end

function p = enrichment_scores(gs0,Ns,A0,nGenome,lnGFacts,name)
% Corrected -log10 enrichment P-values of every gene set (column of gs0)
% against every term (column of the gene x term matrix A0), returned as a
% sparse (gene sets x terms) matrix. name is the title of the progress bar.
Ks = sum(A0,1);                 % genes per term
Isct = (gs0')*A0;               % overlap of every gene set with every term
tot_terms = nnz(Ks);            % non-empty terms, used for the correction

% Terms sharing the same overlap profile AND the same size get identical
% P-values, so compute each distinct combination once. The term size must
% be part of the key: two terms can overlap every gene set equally and
% still differ in K.
[I m n] = unique([Isct' Ks'],'rows');
Ks = Ks(m);
Isct = I(:,1:end-1)';

nSets = size(gs0,2);
P = zeros(nSets,size(Isct,2));
h = waitbar(0,'Processing modules');
set(h,'Name',name);
for i=1:nSets
    waitbar(i/nSets,h,[num2str(i),' / ',num2str(nSets)]);
    sel = find(Isct(i,:)>0);
    for j=1:length(sel)
        P(i,sel(j)) = calcEnrichmentPVal(full(Ns(i)),full(Ks(sel(j))), ...
            full(Isct(i,sel(j))),nGenome,lnGFacts);
    end
end
close(h);

P = P(:,n);                     % expand back to all terms
p = P;
p(P~=0) = max(P(P~=0)-log10(tot_terms),0);
p = sparse(p);

%% Enrichment P-values (by Ron Chen)
% calculate the P value for obtaining enrichment between gene clusters and
% GO terms / transcriptional modules. Implements expression taken from
% 'systematic determination ... network architecture' by Tavazoie & Church

function P=calcEnrichmentPVal(N,K,x,M,lnGFacts)
% P=calcEnrichmentPVal(N,K,x,M,lnGFacts) calculate the P value for
% over-representation of one set within another according to the
% hypergeometric probability density function:
%
%                   (K)(M-K)
%                   (x)(N-x)            K!(M-K)!N!(M-N)!
% y = f(x|M,K,N) = ----------  =   ----------------------------
%                     (M)           x!(K-x)!(M-K+x-N)!(N-x)!M!
%                     (N)
%
% The result, y, is the probability of drawing exactly x of a possible K
% items in N drawings without replacement from a group of M objects. The
% probability computed below is to draw x OR MORE items.
%
% =========== INPUT ============
% N: scalar, # of genes in the cluster (module or gene list)
% K: scalar, # of genes in the GO category, from the defined genome OR
%   number of genes in the transcriptional module onto which the projection
%   is done
% x: scalar, # of genes in the intersection between the cluster and
%   the category
% M: scalar, # of genes in the defined genome
% lnGFacts: (optional) vector of length at least M+1 with
%   lnGFacts(k) = ln((k-1)!), i.e. gammaln(1:M+1). Passing it avoids
%   recomputing the factorials on every call.
%
% =========== OUTPUT ===========
% P: -log10 of the tail probability P(X >= x); Inf when x exceeds
%   min(K,N), i.e. the tail is empty. The value is never negative.
%
% written by Ron Chen 5/10/03


%%

if nargin<5
    lnGFacts=gammaln(1:M+1);
end

Xs = x : min(K,N);
if isempty(Xs)
    % No admissible overlap of x or more: probability 0
    P = Inf;
    return
end

% One is added to every index because lnGFacts(k) holds ln((k-1)!).
Kx = lnGFacts(K+1) - (lnGFacts(K-Xs+1)+lnGFacts(Xs+1));               % ln C(K,Xs)
MKNx = lnGFacts(M-K+1) - (lnGFacts(M-K-N+Xs+1)+lnGFacts(N-Xs+1));     % ln C(M-K,N-Xs)
MN = lnGFacts(M+1) - (lnGFacts(M-N+1)+lnGFacts(N+1));                 % ln C(M,N)

% Sum the tail in log space (log-sum-exp): exponentiating the terms
% directly underflows to 0 for very strong enrichments and would return Inf.
logTerms = Kx+MKNx-MN;
peak = max(logTerms);
lnPVal = peak + log(sum(exp(logTerms-peak)));

% A probability cannot exceed 1; clamp away rounding noise below zero.
P = max(-lnPVal/log(10),0);

