clear all

%% Plug in external LD data (e.g. HapMap or use your own data)
% Downloads one gzipped HapMap CEU genotype file per autosome into ./LD_data/.
% The target folder is created when missing, and every failed download is
% reported right away instead of surfacing later as an unpacking error.

ldDir = './LD_data';
if ~exist(ldDir,'dir')
    mkdir(ldDir);
end

for ii=1:22
    fprintf('Downloading Chr%d...\n',ii);
    url = sprintf('http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseII+III_ncbi_b36/forward/non-redundant/genotypes_chr%d_CEU_r27_nr.b36_fwd.txt.gz',ii);
    target = sprintf('./LD_data/hapmapCEU_chr%d.gz',ii);
    % With two outputs urlwrite does not throw; it reports failure via status.
    [~,status] = urlwrite(url,target);
    if ~status
        warning('Download failed for chromosome %d (%s). Place the file at %s manually before running the conversion step.',...
            ii,url,target);
    end
end

%%%% This will take a while (2-3h) %%%%
% For each autosome: unpack the downloaded archive, parse the genotype table,
% recode every genotype as the number of copies of allele a_B (NaN where the
% first genotype character is 'N'), and store the result in hapmap_chr<N>.mat.
for ii = 1:22
    fprintf('Converting Chr%d to matlab data...\n',ii);
    fn = sprintf('./LD_data/hapmapCEU_chr%d.gz',ii);
    txtFile = fn(1:end-3);      % archive name without the '.gz' suffix
    gunzip(fn);

    C = read_file(txtFile,0,' ',1);
    rs0 = id2rsnum(C{1});
    pos0 = str2double(C{4});
    als = cell2mat(C{2});       % allele strings; characters 1 and 3 are the two alleles
    a_A = als(:,1);
    a_B = als(:,3);

    C = C(12:end);              % columns 12 onwards are processed as one individual each
    nInd = length(C);
    M = zeros(length(rs0),nInd);
    for k = 1:nInd
        if mod(k,10)==0
            fprintf('Individual %d / %d is being processed...\n',k,nInd);
        end
        gt = cell2mat(C{k});
        % count how many of the two genotype characters equal allele a_B
        M(:,k) = (gt(:,1)==a_B) + (gt(:,2)==a_B);
        % genotypes whose first character is 'N' are treated as missing
        M(gt(:,1)=='N',k) = NaN;
    end
    save(sprintf('hapmap_chr%d.mat',ii),'M','rs0','pos0','a_A','a_B');
    delete(txtFile);
end

%% build annotation file
% Stack the SNP ids and positions of all 22 per-chromosome files, tag every
% SNP with its chromosome number, and store the three vectors in annot.mat.
nChr = 22;
chrParts = cell(nChr,1);
posParts = cell(nChr,1);
rsParts = cell(nChr,1);
for ii = 1:nChr
    load(sprintf('hapmap_chr%d.mat',ii),'rs0','pos0');
    chrParts{ii} = repmat(ii,length(rs0),1);
    posParts{ii} = pos0;
    rsParts{ii} = rs0;
end
chr = vertcat(chrParts{:});
pos = vertcat(posParts{:});
rs = vertcat(rsParts{:});

save('annot.mat','chr','pos','rs');

%% metal output files
%%%%%%%%% change here to the location of the metal output files
fn1 = './metal_data/example_discovery'; % metal output file for the discovery cohorts
fn2 = './metal_data/example_replication'; % metal output file for the replication cohorts

%% save metal output in Matlab
% Both cohorts go through the same steps: parse the metal output, look up
% chromosome and position for every SNP id, and store everything in a .mat file.
metalFiles = {fn1, fn2};
matFiles = {'discovery_data.mat', 'replication_data.mat'};

for k = 1:numel(metalFiles)
    [rs,aA,aB,af,x,se,p,n] = read_in_metal_output(metalFiles{k});
    [chr,pos] = rs2chrGpos(rs);
    save(matFiles{k},'x','se','p','n','rs','chr','pos','aA','aB','af');
end

%% match-up discovery and replication
% Restricts both data sets to their shared SNP ids, aligns the replication
% effect sign to the discovery allele coding, drops SNPs whose allele pairs
% cannot be matched, and stores the aligned vectors in combined_data.mat.
% Every saved per-SNP vector (including rs) is filtered with the same mask so
% that they stay index-aligned.
clear all
disc = load('discovery_data.mat');
repl = load('replication_data.mat');

% a indexes the discovery data, b the replication data, for the shared SNPs
[~,a,b] = intersect(disc.rs,repl.rs);

rs = disc.rs(a);
x1 = disc.x(a);
se1 = disc.se(a);
p1 = disc.p(a);
n1 = disc.n(a);
aA1 = disc.aA(a);
aB1 = disc.aB(a);

% chromosome and position are taken from the replication data
chr = repl.chr(b);
pos = repl.pos(b);
x2 = repl.x(b);
se2 = repl.se(b);
p2 = repl.p(b);
n2 = repl.n(b);
aA2 = repl.aA(b);
aB2 = repl.aB(b);

% NOTE(review): allele_swap presumably returns the complementary-strand
% allele - confirm against its definition.
saA2 = lower(allele_swap(aA2));
saB2 = lower(allele_swap(aB2));

% same allele order in both data sets (directly or after allele_swap)
sameOrder = (aA1==aA2 & aB1==aB2) | (aA1==saA2 & aB1==saB2);
% reversed allele order only: the replication effect has to change sign
flipped = ((aA1==aB2 & aB1==aA2) | (aA1==saB2 & aB1==saA2)) & ~sameOrder;
x2(flipped) = -x2(flipped);

% SNPs matching in neither orientation are discarded
keep = sameOrder | flipped;

rs = rs(keep);      % was previously left unfiltered, misaligning rs with the rest
aA = aA1(keep);
aB = aB1(keep);
x1 = x1(keep);
x2 = x2(keep);
se1 = se1(keep);
se2 = se2(keep);
p1 = p1(keep);
p2 = p2(keep);
n1 = n1(keep);
n2 = n2(keep);
chr = chr(keep);
pos = pos(keep);

% per-SNP R^2 computed from the p-value and sample size: 1/(1 + n/z^2),
% with z = norminv(p/2)
R21 = 1./(1+n1./(norminv(p1/2).^2));
R22 = 1./(1+n2./(norminv(p2/2).^2));

save('combined_data.mat','R21','R22','aA','aB','chr',...
    'pos','n1','n2','p1','p2','rs','x1','x2','se1','se2');

%% find loci of interest
% Picks SNPs passing the discovery p-value and sample-size filters, hands them
% to hit2loci together with the pruning distance, and stores the indices of
% the SNPs it returns (relative to combined_data.mat) in selected_loci_index.mat.
clear all

%%%% selection parameters %%%%%%%%%%%%%%%%%%%%
pTrh = 1e-2;     % P-value threshold
posPrune = 1e6;  % pruning distance in bp
nL = 1e4;        % limit in available sample size
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

load('combined_data.mat');

isCandidate = (p1 < pTrh) & (n1 > nL) & (n2 > nL);
candIdx = find(isCandidate);
leadIdx = hit2loci(p1(candIdx),chr(candIdx),pos(candIdx),posPrune);
sel = candIdx(leadIdx);     % map back to indices into the combined data

save('selected_loci_index.mat','sel');


%% perform locus association
% For every selected locus: match the SNPs around the lead SNP to the
% reference genotype panel, keep LD-pruned SNPs below the discovery p-value
% cutoff, and compute from the replication data
%   r2_multiSNP(j) - explained variance estimate of the multi-SNP model
%   r2_top(j)      - explained variance estimate of the top SNP
%   p_LR(j)        - likelihood ratio test: multi-SNP vs top SNP
% Loci without any usable SNP get NaN in all three outputs.
clear all
%%%% Parameters to modify
GC = 'on'; % Genomic control correction? %
pCutOff = 1e-2; % P-value cutoff for discovery SNP selection
ld_trh = .1; % LD-pruning threshold for discovery SNP selection
locusSize = 5e5; % Should be maximum posPrune/2
nMin = 1e4; % minimum discovery sample size for a SNP to enter a locus
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

load combined_data.mat
load selected_loci_index.mat

if strcmp(GC,'on')
    % R22 depends on p2, so it is recomputed after the correction
    p2 = genomic_control(p2);
    R22 = 1./(1+n2./(norminv(p2/2).^2));
end

p_LR = zeros(length(sel),1); % Likelihood ratio test: multiSNP vs top SNP
r2_multiSNP = p_LR; % Explained variance estimate of the multi-SNP
r2_top = p_LR; % explained variance estimate of the top SNP

for j=1:length(sel)
    if mod(j,10)==0
        fprintf('Processing locus %d / %d...\n',j,length(sel));
    end
    ch = chr(sel(j));
    ps = pos(sel(j));
    % SNPs on the same chromosome within +/- locusSize of the lead SNP
    su = find(chr==ch & pos>=ps-locusSize & pos<=ps+locusSize & n1>nMin);

    %%%% Match up with SNP set of the external (HapMap) cohort
    % loads M, rs0, pos0, a_A, a_B for this chromosome
    load(sprintf('hapmap_chr%d.mat',ch));
    [~,a1,a2] = intersect(rs0,rs(su));
    a2 = su(a2);    % a1 indexes the reference panel, a2 the combined data

    F0 = M(a1,:)';
    al1 = a_A(a1);
    al2 = a_B(a1);

    al1 = lower(char(al1));
    al2 = lower(char(al2));
    % signed square roots of the per-SNP R^2 values
    r1 = sign(x1(a2)).*sqrt(R21(a2));
    r2 = sign(x2(a2)).*sqrt(R22(a2));
    s1 = find(al1==aA(a2) & al2==aB(a2));
    s2 = find(al1==aB(a2) & al2==aA(a2));
    % reversed allele order relative to the panel: change the sign
    r1(s2) = -r1(s2);
    r2(s2) = -r2(s2);
    s0 = unique([s1;s2]);   % SNPs whose alleles match in either order
    a1 = a1(s0);
    a2 = a2(s0);

    %%%% sort according to physical position

    [sv,sp] = sort(pos(a2));
    a1 = a1(sp);
    a2 = a2(sp);
    s0 = s0(sp);

    % r1, r2, F0, al1, al2 are still in pre-filter order, hence indexed via s0
    r1 = r1(s0);
    r2 = r2(s0);
    F0 = F0(:,s0);
    al1 = al1(s0);
    al2 = al2(s0);
    n10 = n1(a2);
    n20 = n2(a2);
    ch0 = chr(a2);
    ps0 = pos(a2);
    rs0 = rs(a2);
    p10 = p1(a2);
    p20 = p2(a2);
    m = size(F0,2);

    %%%% select 'nominally' significant SNPs
    ss = find(p10<pCutOff);
    if isempty(ss)
        % No matched SNP passes the cutoff, so there is nothing to test at
        % this locus; without this guard the top-SNP assignment below errors.
        warning('Locus %d: no matched SNP with P < %g; results set to NaN.',j,pCutOff);
        p_LR(j) = NaN;
        r2_multiSNP(j) = NaN;
        r2_top(j) = NaN;
        continue
    end
    al1_ss = al1(ss);
    al2_ss = al2(ss);
    r1_ss = r1(ss);
    r2_ss = r2(ss);
    F0_ss = F0(:,ss);
    n1_ss = n10(ss);
    n2_ss = n20(ss);
    ch0_ss = ch0(ss);
    ps0_ss = ps0(ss);
    rs0_ss = rs0(ss);
    p1_ss = p10(ss);
    nn = median(n2_ss);
    C = corrcoef_NAN(F0_ss);

    %%%%%% prune selected SNPs
    % In order of increasing discovery p-value, keep a SNP only if its squared
    % correlation with every more significant SNP is below ld_trh.
    [sv,sp] = sort(p1_ss);
    C0 = C(sp,sp);
    tr = tril(C0.^2,-1);
    mtr = max(tr,[],2);
    ss = sp(mtr<ld_trh);

    r1_ss = r1_ss(ss);
    r2_ss = r2_ss(ss);
    al1_ss = al1_ss(ss);
    al2_ss = al2_ss(ss);
    F0_ss = F0_ss(:,ss);
    n1_ss = n1_ss(ss);
    n2_ss = n2_ss(ss);
    ch0_ss = ch0_ss(ss);
    ps0_ss = ps0_ss(ss);
    rs0_ss = rs0_ss(ss);
    p1_ss = p1_ss(ss);
    m_eff = length(ss);
    nn = median(n2_ss);
    n_eff = nn;

    % temporarily holds the signed replication correlation of the top
    % discovery SNP; converted to an explained-variance estimate below
    [~,top_ix] = min(p1_ss);
    r2_top(j) = r2_ss(top_ix);

    %%%%% calculate total explained variance of the selected SNPs
    C = corrcoef_NAN(F0_ss);
    C = (1-1e-10)*C + 1e-10*eye(m_eff);    % tiny ridge so that C\ stays solvable
    t = r2_ss'*(C\r2_ss);
    g = F0_ss*(C\r2_ss); % multi-SNP
    % store per locus (previously the whole result vector was overwritten)
    r2_multiSNP(j) = (t-m_eff/nn)*(nn/(nn-m_eff));
    r_tmp = max(0,r2_multiSNP(j));
    r2_f0_SD = sqrt(((nn/(nn-m_eff))^2)*((4*(1-r_tmp)/nn)*r_tmp+2*m_eff*((1-r_tmp)/nn)^2));
    p_LR(j) = gamcdf_tail(nn*(t-r2_top(j).^2),(m_eff-1)/2,2);
    r2_top(j) = r2_top(j)^2-1/nn;

end


