// QUICKTEST: statistical analysis of uncertain genotypes
// Copyright 2007, 2008, Toby Johnson and Zoltan Kutalik

// QUICKTEST is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published
// by the Free Software Foundation; either version 3 of the License,
// or (at your option) any later version.

// QUICKTEST is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// The GNU General Public License can be viewed at
// http://www.gnu.org/licenses


// QUICKTEST was written by
//
// Toby Johnson           and   Zoltan Kutalik
// Toby.Johnson@unil.ch   and   Zoltan.Kutalik@unil.ch



#include "stats.h"

/////////////////////////////////////
//                                 //
// Zoltan's association test codes //
//                                 //
/////////////////////////////////////




int method_score (double v, double d, vector<double> pAB, vector<double> pBB, vector<double> y, 
		  double n, double sumyy,
		  double* beta, double* se, double* pval) {
  
  // U = score (first derivative of the log-likelihood at the origin)
  // I = observed Fisher information (negative inverse Hessian of the log-likelihood function) at the origin
  // effect size is estimated assuming a quadratic log-likelihood function
  // Update since previous version: H(beta,sigma) is non-zero

  double sP, sQ, Py, Qy;
  double P, Q, P2, Q2, PQ, Py2, Qy2, PQy2, Q2y2, P2y2;
  
  double sigma2 = sumyy/(d+n);



  sP = 0;
  sQ = 0;
  Py = 0;
  Qy = 0;
  P2 = 0;
  Q2 = 0;
  PQ = 0;
  Py2 = 0;
  Qy2 = 0;
  PQy2 = 0;
  Q2y2 = 0;
  P2y2 = 0;

  for (int i=0;i<n;++i){
    P = pAB[i];
    Q = pBB[i];
    sP+= P;
    sQ+= Q;
    Py+= P*y[i];
    Qy+= Q*y[i];
    P2+= P*P;
    PQ+= P*Q;
    Q2+= Q*Q;
    Py2+= P*y[i]*y[i];
    Qy2+= Q*y[i]*y[i];
    PQy2+= P*Q*y[i]*y[i];
    Q2y2+= Q*Q*y[i]*y[i];
    P2y2+= P*P*y[i]*y[i];
  }
  

  double sumXbar = sP+2*sQ;
  double Xbary = Py+2*Qy;
  double Xbary2 = (P2y2+4*PQy2+4*Q2y2-(Py2+4*Qy2)+sigma2*(sP+4*sQ));

  double U = Xbary/sigma2;
  double I = -(n*pow2(sigma2))/(2*pow2(Xbary)+pow2(sumXbar)*sigma2-n*Xbary2);

    *se = sqrt(I);
    *beta = U*I;
    // in pchisq, third argument zero means give *upper* tail probability
    *pval = pchisq(U*U*I,1,0,0);
		   
  return (1);
}


int method_newton_mix (vector<double> y, vector<double> pAA, vector<double> pAB, vector<double> pBB, 
		int n, int bigK, vector<double> pi, vector<double> mu, vector<double> sigma,
		double alphaNull,
		int* nIts, double* betaMLE, double* seApprox, double* pLRT) {

  const int maxit = 500;
  const int triggerEM = 20;
  const double tol = 1.e-6; // tolerance for change in alpha and beta
  const double lltol = 1.e-4; // tolerance for per-iteration increase in lnLik

  vector<double> pStarAA0 (bigK);
  vector<double> pStarAB0 (bigK);
  vector<double> pStarBB0 (bigK);
  vector<double> pStarAA (bigK);
  vector<double> pStarAB (bigK);
  vector<double> pStarBB (bigK);
  vector<double> pStarAA1 (bigK);
  vector<double> pStarAB1 (bigK);
  vector<double> pStarBB1 (bigK);
  vector<double> pStarAA2 (bigK);
  vector<double> pStarAB2 (bigK);
  vector<double> pStarBB2 (bigK);
  
  double alpha = alphaNull;
  double oldalpha = alpha;
  double beta = 0.;
  double oldbeta = 0.;
  double alpha_new, beta_new;


  double s2, w11=0, w12=0, w22=0, v0=0, v1=0, v2=0, sv1=0, sv2=0, sw11=0, sw12=0, sw22=0, L, L1=0, L0=0;

  int it; // need to know value after exit from loop....
  for (it=0; it<maxit; ++it) {

    //   cout << "quasi-newton it=" << it <<endl;

    L = 0;
    sv1 = 0;
    sv2 = 0;
    sw11 = 0;
    sw12 = 0;
    sw22 = 0;

    for (int i = 0; i < n; ++i) {
      
      v0 = 0;
      v1 = 0;
      v2 = 0;
      w11 = 0;
      w12 = 0;
      w22 = 0;

      for (int k = 0; k < bigK; ++k) {
	pStarAA[k] = pAA[i] * pi[k] * dnorm(y[i] - alpha, mu[k], sigma[k], 0);
	pStarAB[k] = pAB[i] * pi[k] * dnorm(y[i] - alpha - beta, mu[k], sigma[k], 0);
	pStarBB[k] = pBB[i] * pi[k] * dnorm(y[i] - alpha - 2.*beta, mu[k], sigma[k], 0);
	s2 = pow2(sigma[k]);
	pStarAA0[k] = pStarAA[k]/s2;
	pStarAB0[k] = pStarAB[k]/s2;
	pStarBB0[k] = pStarBB[k]/s2;

	pStarAA1[k] = pStarAA[k] * (y[i] - alpha -  mu[k])/s2;
	pStarAB1[k] = pStarAB[k] * (y[i] - alpha - beta - mu[k])/s2;
	pStarBB1[k] = pStarBB[k] * (y[i] - alpha - 2.*beta - mu[k])/s2;

	pStarAA2[k] = pStarAA1[k] * (y[i] - alpha -  mu[k])/s2;
	pStarAB2[k] = pStarAB1[k] * (y[i] - alpha - beta - mu[k])/s2;
	pStarBB2[k] = pStarBB1[k] * (y[i] - alpha - 2.*beta - mu[k])/s2;

	v0 += pStarAA[k] + pStarAB[k] + pStarBB[k];
	v1 += (pStarAA1[k] + pStarAB1[k] + pStarBB1[k]);
	v2 += (pStarAB1[k] + 2*pStarBB1[k]);

	w11 += pStarAA2[k]+pStarAB2[k]+pStarBB2[k]-pStarAA0[k]-pStarAB0[k]-pStarBB0[k];
	w12 += pStarAB2[k]+2*pStarBB2[k]-pStarAB0[k]-2*pStarBB0[k];
	w22 += pStarAB2[k]+4*pStarBB2[k]-pStarAB0[k]-4*pStarBB0[k];

      }

      v1 = v1/v0;
      v2 = v2/v0;
      w11 = w11/v0;
      w12 = w12/v0;
      w22 = w22/v0;

      sv1 += v1; 
      sv2 += v2;


      sw11 += w11 - pow2(v1);
      sw12 += w12 - v1*v2;
      sw22 += w22 - pow2(v2);
      L += log(v0);
    }

    if (it==0) {
      L0 = L;
    }

    double det_inv = 1/(sw11*sw22-pow2(sw12));
    double d_alpha = -det_inv*(sw22*sv1-sw12*sv2);
    double d_beta = -det_inv*(sw11*sv2-sw12*sv1);

    int j;
    for (j = 0; j < triggerEM; j++){
      alpha_new = alpha+d_alpha/pow(2,j);
      beta_new = beta+d_beta/pow(2,j);
      L1 = 0;

      for (int i = 0; i < n; ++i) {
	v0 = 0;
        for (int k = 0; k < bigK; ++k) {
	  pStarAA[k] = pAA[i] * pi[k] * dnorm(y[i] - alpha_new, mu[k], sigma[k], 0);
	  pStarAB[k] = pAB[i] * pi[k] * dnorm(y[i] - alpha_new - beta_new, mu[k], sigma[k], 0);
	  pStarBB[k] = pBB[i] * pi[k] * dnorm(y[i] - alpha_new - 2.*beta_new, mu[k], sigma[k], 0);
	  v0 += pStarAA[k] + pStarAB[k] + pStarBB[k];
	}
	L1 += log(v0);
      }

      if (L1>L) {
	alpha = alpha_new;
	beta = beta_new;
	break;
      }
    }
    

     // if it is stuck in a local minimum bring it closer to the attractor by the EM method: copied from Toby's code

    if (j >= triggerEM) {
      //      cout << "EM is helping out" <<endl;
      vector<double> weight (bigK);
      for (int k = 0; k < bigK; ++k) {
	weight[k] = pow2(sigma[0]/sigma[k]);
      }
      for (int em_it = 0; em_it<10; ++em_it) {
	++it;
	double sumw = 0.;
	double sumg = 0.;
	double sumy = 0.;
	double sumyg = 0.;
	double sumg2 = 0.;
	double sumyy = 0.;

	for (int i = 0; i < n; ++i) {
	  
	  double pStarSum = 0.;
	  for (int k = 0; k < bigK; ++k) {
	    pStarAA[k] = pAA[i] * pi[k] * dnorm(y[i] - alpha, mu[k], sigma[k], 0);
	    pStarAB[k] = pAB[i] * pi[k] * dnorm(y[i] - alpha - beta, mu[k], sigma[k], 0);
	    pStarBB[k] = pBB[i] * pi[k] * dnorm(y[i] - alpha - 2.*beta, mu[k], sigma[k], 0);
	    pStarSum += pStarAA[k] + pStarAB[k] + pStarBB[k];
	  }
	  
	  for (int k = 0; k < bigK; ++k) {
	    pStarAA[k] /= pStarSum;
	    pStarAB[k] /= pStarSum;
	    pStarBB[k] /= pStarSum;
	    double pStarAnyW = (pStarAA[k] + pStarAB[k] + pStarBB[k]) * weight[k];
	    
	    double eg = pStarAB[k] + 2.*pStarBB[k];
	    double eg2 = pStarAB[k] + 4.*pStarBB[k];
	    double ymmu = y[i]-mu[k];
	    
	    sumw += pStarAnyW;
	    sumg += eg * weight[k];
	    sumy += ymmu * pStarAnyW;
	    sumyg += ymmu * eg * weight[k];
	    sumg2 += eg2 * weight[k];
	    sumyy += pow2(ymmu) * pStarAnyW;
	  }
	}

	alpha = (sumy*sumg2/sumw - sumyg*sumg/sumw) / (sumg2 - pow2(sumg)/sumw);
	beta = (sumyg - sumy*sumg/sumw) / (sumg2 - pow2(sumg)/sumw);
      }
    }

    // EM-"call" ends here

    if (fabs(log(beta/oldbeta)+log(alpha/oldalpha)) < tol || (L1-L<lltol)) {
     //  && (fabs(log(beta/oldbeta)+log(alpha/oldalpha)) < lltol ))) {
      oldalpha = alpha;
      oldbeta = beta;
      break;
    } else {
      oldalpha = alpha;
      oldbeta = beta;
    }
  } // end for loop so ! it<maxit

  if (it >= maxit) {
    return(0);
  }
  
  if (it < maxit) {
    *nIts = it+1;
    *betaMLE = beta;
    *seApprox = beta/sqrt(2.*(L1-L0));
    *pLRT = pchisq(2.*(L1-L0), 1, 0, 0);    
    return(1);
  }
  return(0);
}


int precompute_score_mix (vector<double>* l1, vector<double>* l2, 
			  vector<double> y0, int n,
			  vector<double> pi, vector<double> mu, 
			  vector<double> sigma, int bigK) {

  // have to refer to elements of l1 and l2 using (*l1)[i] i=1..n etc.
  
  vector< vector<double> > *w = new vector< vector<double> >(n, vector<double>(bigK));

  for (int i=0; i<n; ++i) {
    double sw = 0;
    for (int k=0; k<bigK; ++k) {
      (*w)[i][k] = pi[k]*dnorm(y0[i], mu[k], sigma[k], 0);
      sw += (*w)[i][k];
    }
    for (int k=0; k<bigK; ++k) {
      (*w)[i][k] = (*w)[i][k]/sw;
    }
    double sl1 = 0;
    double sl2 = 0;
    for (int k=0; k<bigK; ++k) {
      double s2 = pow2(sigma[k]);
      sl1 += (*w)[i][k]*(y0[i] - mu[k])/s2;
      sl2 += (*w)[i][k]*(pow2(y0[i] - mu[k])/s2 - 1)/s2;
    }
    (*l1)[i] = sl1;
    (*l2)[i] = sl2;
  }

  return(1);
}

int method_score_mix (vector<double> pAB, vector<double> pBB, int n, vector<double> l1, vector<double> l2, double* beta, double* se, double* pval) {

  double U1 = 0, U2 = 0;
  double H11=0, H12=0, H22=0;

  for (int i=0; i<n; ++i){

    double tmp1 = (pAB[i]+2*pBB[i]);
    double tmp2 = tmp1+2*pBB[i];

    U1 += l1[i]*tmp1;
    U2 += l1[i];
    H11 += l2[i]*tmp2 - pow2(l1[i]*tmp1);
    H12 += l2[i]*tmp1 - pow2(l1[i])*tmp1;
    H22 += l2[i] - pow2(l1[i]);
  }

  double I11_inv = -H22/(H11*H22-pow2(H12));
  double I12_inv = H12/(H11*H22-pow2(H12));
  double I22_inv = -H11/(H11*H22-pow2(H12));

  *beta = U1*I11_inv+U2*I12_inv;
  *se = sqrt(I11_inv);
  double tmp = I11_inv*pow2(U1) + I12_inv*U1*U2 + I22_inv*pow2(U2);
  *pval = pchisq(tmp,1,0,0);

  return(1);
}

int method_score_mean_mix (vector<double> pAB, vector<double> pBB, int n, vector<double> l1, vector<double> l2, double* beta, double* se, double* pval) {

  double U1 = 0, U2 = 0;
  double H11=0, H12=0, H22=0;

  for (int i=0; i<n; ++i){

    double tmp1 = (pAB[i]+2*pBB[i]);

    U1 += l1[i]*tmp1;
    U2 += l1[i];
    H11 += l2[i]*pow2(tmp1) - pow2(l1[i]*tmp1);
    H12 += l2[i]*tmp1 - pow2(l1[i])*tmp1;
    H22 += l2[i] - pow2(l1[i]);
  }

  double I11_inv = -H22/(H11*H22-pow2(H12));
  double I12_inv = H12/(H11*H22-pow2(H12));
  double I22_inv = -H11/(H11*H22-pow2(H12));

  *beta = U1*I11_inv+U2*I12_inv;
  *se = sqrt(I11_inv);
  double tmp = I11_inv*pow2(U1) + I12_inv*U1*U2 + I22_inv*pow2(U2);
  *pval = pchisq(tmp,1,0,0);

  return(1);
}


///////////////////////////////////
//                               //
// Toby's association test codes //
//                               //
///////////////////////////////////

int method_em (vector<double> y, vector<double> pAA, vector<double> pAB, vector<double> pBB, 
	       int n, double sumyy, 
	       int* nIts, double* betaMLE, double* seApprox, double* pLRT) {

  const int maxit = 500;
  const double tol = 1.e-6;

  double pStarAA;
  double pStarAB;
  double pStarBB;
  double pStarSum;
  double eg,eg2;

  double alpha = 0.;
  double beta = 0.;
  double sigma0 = pow(sumyy / (double)n, 0.5);
  double sigma = sigma0;
  double oldbeta = 0.;

  double sumg;
  double sumyg;
  double sumg2;

  int it; // need to know value after exit from loop....
  for (it=0; it<maxit; ++it) {
    
    sumg = 0.;
    sumyg = 0.;
    sumg2 = 0.;

    for (int i = 0; i < n; ++i) {
      pStarAA = pAA[i] * dnorm(y[i] - alpha, 0, sigma, 0);
      pStarAB = pAB[i] * dnorm(y[i] - alpha - beta, 0, sigma, 0);
      pStarBB = pBB[i] * dnorm(y[i] - alpha - 2.*beta, 0, sigma, 0);
      
      pStarSum = pStarAA+pStarAB+pStarBB;
      pStarAA /= pStarSum;
      pStarAB /= pStarSum;
      pStarBB /= pStarSum;
      
      eg = pStarAB + 2.*pStarBB;
      eg2 = pStarAB + 4.*pStarBB;
      
      sumg += eg;
      sumyg += y[i]*eg;
      sumg2 += eg2;
    }
    
    beta = sumyg/(sumg2 - pow2(sumg)/(double)n);
    alpha = -beta * sumg/(double)n;
    sigma = pow((sumyy - pow2(beta)/(sumg2 - pow2(sumg)/(double)n))/double(n), 0.5);
    
    if (it>0) {
      if (fabs(log(beta/oldbeta)) < tol) {
	break;
      }
      oldbeta = beta;
    }
  }

  if (it >= maxit) {
    return(0);
  }

  double lldiff = 0.;
  for (int i=0; i<n; ++i) {
    lldiff += log(pAA[i] * dnorm(y[i]-alpha, 0, sigma, 0) 
		  + pAB[i] * dnorm(y[i]-alpha-beta, 0, sigma, 0) 
		  + pBB[i] * dnorm(y[i]-alpha-2.*beta, 0, sigma, 0)) - dnorm(y[i], 0, sigma0, 1);
  }

  if (it < maxit) {
    *nIts = it;
    *betaMLE = beta;
    *pLRT = pchisq(2.*lldiff, 1, 0, 0);
    //    *seApprox = fabs(*betaMLE / qnorm(*pLRT/2., 0, 1, 1, 0)); // lower.tail = TRUE, log.p = FALSE
    *seApprox = *betaMLE/sqrt(2.*lldiff);
    return(1);
  }
  return(0);
}
		


int method_call (vector<double> y, vector<int> callG,
		 int n, double priorDF, double ppbeta, double callThresh,
		 int* nCalled, double* beta, double* se, double* pval) {
  int i;
  int nAB = 0;
  int nBB = 0;
  double sumy = 0.;
  double sumyy = 0.; // we have to compute these ourselves because some individuals might be not-called 
  double sumyg = 0.;
  *nCalled = 0;
  double df;

  for (i = 0; i < n; ++i) {
    switch (callG[i]) {
    case 0:
      sumy += y[i];
      sumyy += pow2(y[i]);
      ++ *nCalled;
      break;
    case 1:
      sumy += y[i];
      sumyy += pow2(y[i]);
      ++nAB;
      sumyg += y[i];
      ++ *nCalled;
      break;
    case 2:
      sumy += y[i];
      sumyy += pow2(y[i]);
      ++nBB;
      sumyg += y[i]*2.;
      ++ *nCalled;
      break;
    default:
      // do nothing
      break;
    }
  }

  double sumg = (double)nAB + 2.*(double)nBB;
  // here we have to center sumy, sumyy and sumyg,
  //  because sumy *for the called individuals* may not be zero
  sumyy = sumyy - pow2(sumy)/(double)(*nCalled);
  sumyg = sumyg - sumy*sumg/(double)(*nCalled);
  // sumy = 0.; // never used again so no need to center this
  df = (double)(*nCalled) + priorDF;

  // here we add on ppbeta, so sumgg is really inv(V*)
  double sumgg;
  if (nAB == *nCalled || nBB == *nCalled || (nAB == 0 && nBB == 0)) {
    sumgg = 0.;
  } else {
    sumgg = (double)nAB + 4.*(double)nBB - pow2(sumg)/(double)(*nCalled) + ppbeta;
  }

  if (sumgg > 0. && df > 0.) {
    *beta = sumyg/sumgg;
    *se = sqrt((sumyy/sumgg - pow2(*beta))/df);
    *pval = 2.*pt(-fabs(*beta)/(*se), df, 1, 0);
    return (1);
  }
  return (0);
}

int precompute_mean (vector<double> pAB, vector<double> pBB, int n,
		     double meanAB, double meanBB, vector<double>* g, double* sumgg) {

  double eg = (meanAB + 2.*meanBB)/(double)n;
  *sumgg = 0.;
  for (int i = 0; i < n; ++i) {
    (*g)[i] = pAB[i] + 2.*pBB[i] - eg;
    *sumgg += pow2((*g)[i]);
  }
  return(0);
}

int method_mean (vector<double> y, vector<double> g, int n,
		 double sumgg, double sumyy, double priorDF, double ppbeta, 
		 double* beta, double* se, double* pval) {

  double sumyg = 0.;
  sumgg += ppbeta;
  double df = (double)(n) + priorDF;
  for (int i = 0; i < n; ++i) {
    sumyg += y[i]*g[i];
  }

  // no need to recenter because sumy exactly zero
  if (sumgg > 0. && df > 0.) {
    *beta = sumyg/sumgg;
    *se = sqrt((sumyy/sumgg - pow2(*beta))/df);
    *pval = 2.*pt(-fabs(*beta)/(*se), df, 1, 0);
    return (1);
  }
  return (0);
}

int method_sim (vector<double> y, vector<double> pAA, vector<double> pAAAB, int n,
		double sumyy, double priorDF, double ppbeta, int nsims, 
		int* nUsed, double* ebeta, double* sdbeta, double* pval, double* vWeight) {
  
  int sim;
  int i;
  *nUsed = 0;
  int nAB;
  int nBB;
  double r;
  double sumyg;
  double sumgg;
  double df = (double)n + priorDF; // df == dstar
  double thisEbeta;
  double thisVbeta;
  double weight;
  double sumEbeta = 0.;
  double sumEbeta2 = 0.;
  double sumPbeta = 0.;
  double sumWeight = 0.;
  double sumWeight2 = 0.;

  for (sim=0; sim < nsims; ++sim) {
    nAB = 0;
    nBB = 0;
    sumyg = 0.;
    
    for (i = 0; i < n; ++i) {
      r = unif_rand();
      if (r < pAA[i]) {
	// do nothing
      } else if (r < pAAAB[i]) {
	sumyg += y[i];
	++nAB;
      } else {
	sumyg += 2.*y[i];
	++nBB;
      }
    }

    // note, no need to update sumyg since mean(y)==0
    sumgg = (double)nAB + 4.*(double)nBB - pow2((double)nAB + 2.*(double)nBB) / n + ppbeta;

    if (sumgg > 0. && df > 2.) {
      thisEbeta = sumyg/sumgg;

      // in next line, note that 
      // posterior has variance like a*/(d*-2), 
      // so _additional_ -2 for invGamma beyond priorDF
      thisVbeta = (sumyy/sumgg - pow2(thisEbeta))/(df-2.);

      // is the weighted average numerically stable ??? 
      weight =  pow(sumgg, -.5) * pow(sumyy / (sumyy - sumgg*pow2(thisEbeta)), df/2.);


      //      cout << label << " " << weight << " " << thisEbeta << " " << thisVbeta << " " << pt(-fabs(thisEbeta)/sqrt(thisVbeta*(df-2.)/df), df, 1, 0) << endl;

      sumEbeta += thisEbeta * weight;
      sumEbeta2 += (thisVbeta + pow2(thisEbeta)) * weight;
      sumPbeta += pt(-thisEbeta/sqrt(thisVbeta*(df-2.)/df), df, 1, 0) * weight;
      sumWeight += weight;
      sumWeight2 += pow2(weight);

      ++ *nUsed;

    }
    // else do nothing
  }

  if (*nUsed > 0) {
    *ebeta = sumEbeta/sumWeight;
    *sdbeta = sqrt(sumEbeta2/sumWeight - pow2(*ebeta));
    double thisP = sumPbeta/sumWeight;
    //*pval = 2.*(thisP < 0.5 ? thisP : 1.-thisP);
    *pval = thisP;
    *vWeight = (sumWeight2 - pow2(sumWeight)/(double)(*nUsed))/(*nUsed);
    return (1);
  }
  return (0);
}

int synthetic (vector<double>* y, vector<double> yold, 
	       vector<double>* g, vector<double> pAA, vector<double> pAAAB, int n,
	       vector<int> iPerm, double beta, double* sumyy, double* sumgg) {
  
  double r;
  double sumy = 0.;
  double sumg = 0.;
  *sumyy = 0.;

  for (int i = 0; i < n; ++i) {
    // set equal to old phenotype
    (*y)[i] = yold[iPerm[i]];

    // simulate genotype and add genetic effect
    r = unif_rand();
    if (r < pAA[i]) {
      (*g)[i] = 0.;
      // do nothing to *y
    } else if (r < pAAAB[i]) {
      (*g)[i] = 1.;
      (*y)[i] += beta;
    } else {
      (*g)[i] = 2.;
      (*y)[i] += 2.*beta;
    }

    // update totals for moments
    sumg += (*g)[i];
    *sumgg += pow2((*g)[i]);
    sumy += (*y)[i];
    *sumyy += pow2((*y)[i]);
  }

  // center new genotype and phenotype
  double meang = sumg/(double)n;
  double meany = sumy/(double)n;
  for (int i = 0; i < n; ++i) {
    (*g)[i] -= meang;
    (*y)[i] -= meany;
  }
  // update second moments
  *sumgg -= (double)n*pow2(meang);
  *sumyy -= (double)n*pow2(meany);

  return (0);
}





int method_mcmc (vector<double> y, vector<double> pAA, vector<double> pAB, vector<double> pBB, int n,
		 double sumyy, double priorDF, double ppbeta, int nsims, int burnin,
		 int* nUsed, double* ebeta, double* sdbeta, double* pval) {
  
  // MCMC with alternate updates of (whole-genotype-vector) and (alpha,beta,sigma^2)
  // computes Rao---Blackwellised estimate of posterior for beta

  // simple version with single chain

  // uses data-dependent prior of Wasserman 2000
  // (J. R. Statist. Soc. B. 62(1):159--180), see also Diebolt and
  // Robert 1994 (J. R. Statist. Soc. B. 56(2):363--375)

  // gives up if four consecutive attempts to simulate genotypes are monomorphic

  // (iterations are 0...(nsims-1) and results are used from burnin...(nsims-1),#
  // therefore burnin is number of iterations not used

  // we set the return arguments to stupid values to alert user to any
  // return(0) situation not properly trapped by the calling code
  *ebeta = -999.;
  *sdbeta = -999.;
  *pval = -999.;

  int nAB;
  int nBB;
  double r;
  double sumyg;
  double sumg;
  double sumgg;
  double df = (double)n + priorDF; // df == dstar
  if (df <= 2.) {
    return (0);
  }
  double thisEbeta;
  double thisaStar;
  double thisVbeta;
  double thisPLeft;
  double sumEbeta = 0.;
  double sumEbeta2 = 0.;
  double sumPbeta = 0.;

  double alpha = 0.;
  double beta = 0.;
  double sigma2 = sumyy / (double)n;

  double pStarAA, pStarAB, pStarBB, pStarSum;
  int attempt;
  
  *nUsed = 0;

  for (int sim = 0; sim < nsims; ++sim) {
    
    nAB = 0;
    nBB = 0;
    sumyg = 0.;
    
    attempt = 0; do {
      if (++attempt > 4) return (0);
      for (int i = 0; i < n; ++i) {
	// save effort by using normal density up to irrelevant normalising constants
	pStarAA = pAA[i] * exp(-0.5*pow2(y[i] - alpha)/sigma2);
	pStarAB = pAB[i] * exp(-0.5*pow2(y[i] - alpha - beta)/sigma2);
	pStarBB = pBB[i] * exp(-0.5*pow2(y[i] - alpha - 2.*beta)/sigma2);
	  
	pStarSum = pStarAA+pStarAB+pStarBB;
	pStarAA /= pStarSum;
	pStarAB /= pStarSum;
	// pStarBB /= pStarSum; // we save effort by not computing this, since we never need it
	
	r = unif_rand();
	if (r < pStarAA) {
	  // do nothing
	} else if (r < pStarAA+pStarAB) {
	  sumyg += y[i];
	  ++nAB;
	} else {
	  sumyg += 2.*y[i];
	  ++nBB;
	}
      }
      // note, no need to update sumyg since mean(y)==0
      sumg = (double)nAB + 2.*(double)nBB;
      sumgg = (double)nAB + 4.*(double)nBB - pow2(sumg) / (double)n + ppbeta;
    } while (sumgg <= 0.);
      
    thisEbeta = sumyg/sumgg;
    thisaStar = sumyy-pow2(thisEbeta)*sumgg;
      
    // in next line, note that 
    // posterior has variance like a*/(d*-2), 
    // so _additional_ -2 for invGamma beyond priorDF
    thisVbeta = thisaStar/sumgg/(df-2.); // was (sumyy/sumgg - pow2(thisEbeta))/(df-2.);
      
    thisPLeft = pt(-thisEbeta/sqrt(thisVbeta*(df-2.)/df), df, 1, 0);
    // // code to generate verbose output tracking MCMC iterations
    //cout << "chain=" << chain;
    //cout << "sigma=" << sigma << " alpha=" << alpha << " beta=" << beta;
    //cout << " alpha+bar(x)beta=" << alpha+sumg/(double)n*beta;
    //cout << " Ebeta=" << thisEbeta << endl;
      
    sigma2 = 1. / rgamma(df/2.,2./thisaStar);
    beta = rnorm(thisEbeta, sqrt(sigma2/sumgg));
    alpha = -beta * sumg/(double)n + rnorm(0., sqrt(sigma2/(double)n));
      
    if (sim >= burnin) {
      sumEbeta += thisEbeta;
      sumEbeta2 += (thisVbeta + pow2(thisEbeta));
      sumPbeta += thisPLeft;
      ++(*nUsed);
    }
  
  }

  if (*nUsed > 0) {
    *ebeta = sumEbeta/(*nUsed);
    *sdbeta = sqrt(sumEbeta2/(*nUsed) - pow2(*ebeta));
    double thisP = sumPbeta/(*nUsed);
    *pval = 2.*(thisP < 0.5 ? thisP : 1.-thisP);
    return (1);
  }
  return (0);
}


int method_mc3 (vector<double> y, vector<double> pAA, vector<double> pAB, vector<double> pBB, int n,
		double sumyy, double priorDF, double ppbeta, int nsims, int nChains, 
		double* ebeta, double* sdbeta, double* pval, double* sqrtRhat) {
  
  // MCMC with alternate updates of (whole-genotype-vector) and (alpha,beta,sigma^2)
  // computes Rao---Blackwellised estimate of posterior for beta

  // multiple chains and sqrtRhat convergence diagnostic

  // uses data-dependent prior of Wasserman 2000
  // (J. R. Statist. Soc. B. 62(1):159--180), see also Diebolt and
  // Robert 1994 (J. R. Statist. Soc. B. 56(2):363--375)

  // gives up if four consecutive attempts to simulate genotypes are monomorphic

  // set the return arguments to stupid values to alert user to any return(0) situation
  // not properly trapped by the calling code
  *ebeta = -999.;
  *sdbeta = -999.;
  *pval = -999.;

  int nAB;
  int nBB;
  double r;
  double sumyg;
  double sumg;
  double sumgg;
  double df = (double)n + priorDF; // df == dstar
  if (df <= 2.) {
    return (0);
  }
  double thisEbeta;
  double thisaStar;
  double thisVbeta;
  double thisPLeft;
  double sumEbeta = 0.;
  double sumEbeta2 = 0.;
  double sumPbeta = 0.;

  int chain;
  vector<double> alpha (nChains, 0.);
  vector<double> beta (nChains, 0.);
  vector<double> sigma2 (nChains, sumyy / (double)n);
  double thisLnp;
  vector<double> lnp (nChains, 0.);
  vector<double> lnp2 (nChains, 0.);

  sumg = 0.;
  for (int i=0; i<n; ++i) {
    sumg += pAB[i]+2.*pBB[i];
  }
  
  // the problem is, how to think of some overdispersed initial conditions...
  beta[1] = -0.3 * sqrt(sigma2[0]);
  alpha[1] = -beta[1]*sumg/(double)n;
  beta[2] = 0.3 * sqrt(sigma2[0]);
  alpha[2] = -beta[2]*sumg/(double)n;

  double pStarAA, pStarAB, pStarBB, pStarSum;
  int attempt;
  
  
  int burnin = int(0.25*nsims);
  int nUsed = 0;

  for (int sim = 0; sim < nsims; ++sim) {
     
    for (chain = 0; chain < nChains; ++chain) {
      nAB = 0;
      nBB = 0;
      sumyg = 0.;
    
      attempt = 0; do {
	if (++attempt > 4) return (0);
	for (int i = 0; i < n; ++i) {
	  pStarAA = pAA[i] * exp(-0.5*pow2(y[i] - alpha[chain])/sigma2[chain]);
	  pStarAB = pAB[i] * exp(-0.5*pow2(y[i] - alpha[chain] - beta[chain])/sigma2[chain]);
	  pStarBB = pBB[i] * exp(-0.5*pow2(y[i] - alpha[chain] - 2.*beta[chain])/sigma2[chain]);
	  
	  pStarSum = pStarAA+pStarAB+pStarBB;
	  pStarAA /= pStarSum;
	  pStarAB /= pStarSum;
	  // pStarBB /= pStarSum; // we save effort by not computing this
	
	  r = unif_rand();
	  if (r < pStarAA) {
	    // do nothing
	  } else if (r < pStarAA+pStarAB) {
	    sumyg += y[i];
	    ++nAB;
	  } else {
	    sumyg += 2.*y[i];
	    ++nBB;
	  }
	}
	// note, no need to update sumyg since mean(y)==0
	sumg = (double)nAB + 2.*(double)nBB;
	sumgg = (double)nAB + 4.*(double)nBB - pow2(sumg) / (double)n + ppbeta;
      } while (sumgg <= 0.);
      
      thisEbeta = sumyg/sumgg;
      thisaStar = sumyy-pow2(thisEbeta)*sumgg;
      
      // in next line, note that 
      // posterior has variance like a*/(d*-2), 
      // so _additional_ -2 for invGamma beyond priorDF
      thisVbeta = thisaStar/sumgg/(df-2.); // was (sumyy/sumgg - pow2(thisEbeta))/(df-2.);
      
      thisPLeft = pt(-thisEbeta/sqrt(thisVbeta*(df-2.)/df), df, 1, 0);
      // // code to generate verbose output tracking MCMC iterations
      //cout << "chain=" << chain;
      //cout << "sigma=" << sigma << " alpha=" << alpha << " beta=" << beta;
      //cout << " alpha+bar(x)beta=" << alpha+sumg/(double)n*beta;
      //cout << " Ebeta=" << thisEbeta << endl;
      
      sigma2[chain] = 1. / rgamma(df/2.,2./thisaStar);
      beta[chain] = rnorm(thisEbeta, sqrt(sigma2[chain]/sumgg));
      alpha[chain] = -beta[chain] * sumg/(double)n + rnorm(0., sqrt(sigma2[chain]/(double)n));
      
      if (sim >= burnin) {
	sumEbeta += thisEbeta;
	sumEbeta2 += (thisVbeta + pow2(thisEbeta));
	sumPbeta += thisPLeft;
	++nUsed;
      }
      
      thisLnp = max(-pt(-thisEbeta/sqrt(thisVbeta*(df-2.)/df), df, 1, 1), 
		    -pt(-thisEbeta/sqrt(thisVbeta*(df-2.)/df), df, 0, 1));
      lnp[chain] += thisLnp;
      lnp2[chain] += pow2(thisLnp);
    }
    
  }

  *ebeta = sumEbeta/nUsed;
  *sdbeta = sqrt(sumEbeta2/nUsed - pow2(*ebeta));
  double thisP = sumPbeta/nUsed;
  *pval = 2.*(thisP < 0.5 ? thisP : 1.-thisP);
  // *pval = thisP; // if instead we wanted to report the left P-value


  // really horrible hack code to compute sqrtRhat,
  // using all iterations (thus conservative)
  double lnpAll = 0.;
  for (chain = 0; chain < nChains; ++chain) {
    lnpAll += lnp[chain];
  }
  double gmean = lnpAll / (double)(nsims*nChains);
  double Bon = 0.; double Wtm=0.;
  for (chain = 0; chain < nChains; ++chain) {
    double mean = lnp[chain]/(double)nsims;
    double var = (lnp2[chain]-pow2(lnp[chain])/(double)nsims) / (double)(nsims-1);
    Bon += pow2(mean-gmean);
    Wtm += var;
  }
  Bon /= ((double)nChains-1.);
  Wtm /= (double)nChains;
  *sqrtRhat = sqrt(Bon/Wtm + (double)(nsims-1) / (double)nsims);
  
  return (1);
}

int refit_mixture (vector<double> y, int n, double sumyy, int bigK, bool muZero,
		   vector<double>* pi, vector<double>* mu, vector<double>* sigma, 
		   bool verbose) {
  
  const int maxit = 500;
  const double tol = 1.e-8;
  const double lltol = 1.e-4;

  vector<double> sigma2 (bigK);
  for (int k=0; k<bigK; ++k) {
    sigma2[k] = pow2((*sigma)[k]);
  }

  vector<double> en (bigK);
  vector<double> esumy (bigK);
  vector<double> esumyy (bigK);
  vector<double> p (bigK);
  double sump;

  double lnlikNull = -(double)n*(0.5+M_LN_SQRT_2PI+log(sqrt(sumyy/(double)n)));
  double lnlik;
  double oldlnlik;

  int it; 
  double ssdiff; // need to know values of these after exit from loop....

  for (it=0; it<maxit; ++it) {
    
    for (int k = 0; k < bigK; ++k) {
      en[k] = 0.;
      esumy[k] = 0.;
      esumyy[k] = 0.;
    }

    for (int i = 0; i < n; ++i) {
      sump = 0.;
      for (int k = 0; k < bigK; ++k) {
	//	p[k] = (*pi)[k] * exp(-0.5*y2[i]/sigma2[k]) / (*sigma)[k];
	p[k] = (*pi)[k] * dnorm(y[i], (*mu)[k], (*sigma)[k], 0);
	sump += p[k];
      }
      for (int k = 0; k < bigK; ++k) {
	double thisp = p[k]/sump;
	en[k] += thisp;
	esumy[k] += thisp * y[i];
	esumyy[k] += thisp * pow2(y[i]);
      }
    }
    ssdiff = 0.;
    for (int k = 0; k < bigK; ++k) {
      (*pi)[k] = en[k]/(double)n;
      double oldsigma = (*sigma)[k];
      if (muZero) {
	sigma2[k] = esumyy[k]/en[k];
      } else {
	(*mu)[k] = esumy[k]/en[k];
	sigma2[k] = esumyy[k]/en[k] - pow2((*mu)[k]);
      }
      (*sigma)[k] = sqrt(sigma2[k]);
      ssdiff += (*pi)[k]*pow2((*sigma)[k]-oldsigma);
    }
    if (ssdiff < tol) break;
    
    if (it%100 == 0) {
      lnlik = 0.;
      for (int i = 0; i < n; ++i) {
	double f = 0.;
	for (int k=0; k<bigK; ++k) {
	  f += (*pi)[k] * dnorm(y[i], (*mu)[k], (*sigma)[k], 0);
	}
	lnlik += log(f);
      }
      if ((lnlik - oldlnlik)/100 < lltol) {
	break;
      }
      oldlnlik = lnlik;
    }
  }

  lnlik = 0.;
  for (int i = 0; i < n; ++i) {
    double f = 0.;
    for (int k = 0; k < bigK; ++k) {
      f += (*pi)[k] * dnorm(y[i], (*mu)[k], (*sigma)[k], 0);
    }
    lnlik += log(f);
  }

  if (it<maxit) {
    if (verbose) {
      cout << "# refitted mixture with [ " << bigK << " ] components in " << it << " steps" << endl;
      cout << "#   lnLik = " << lnlik << " ( versus " << lnlikNull << " for normal model )" << endl;
      cout << "#   BIC = " << 2.*(lnlikNull-lnlik)+log(n)*3.*(double)(bigK-1) << " versus normal model (lower is better)" << endl;
      for (int k = 0; k < bigK; ++k) {
	cout << "#   pi" << k+1 << " = " << (*pi)[k];
	cout << " mu" << k+1 << " = " << (*mu)[k];
	cout << " sigma" << k+1 << " = " << (*sigma)[k] << endl;
      }
    }
    return(1);
  } else {
    if (verbose) {
      cout << "# failed to fit mixture with [ " << bigK << " ] components" << endl;
      cout << "#   after " << maxit << " iterations of EM algorithm, WSSD = " << ssdiff << endl;
    }
    return(0);
  }
}


int fit_mixture (vector<double> y, int n, double sumyy, int bigK, bool muZero,
		 vector<double>* piBest, vector<double>* muBest, vector<double>* sigmaBest, 
		 io_spec* io, bool verbose) {
  
  const int maxstart = 50;
  const int maxit = 1000;

  vector<double> pi (bigK);
  vector<double> mu (bigK);
  vector<double> sigma (bigK);
  double sigma1 = sqrt(sumyy/(double)n);

  double lnlik;
  double lnlikBest = 0.;
  bool thisConv;
  bool convOnce = false;
  bool thisDegen;
  bool convDegen = false; // set if converged to degenerate solutions
  double muBad;
  set<double> muDegen;

  double lnlikNull = -(double)n*(0.5+M_LN_SQRT_2PI+log(sigma1));
  int startBest = -1;
  int itBest = -1;
  int thisIt = 0;

  double ssdiff; // need to know values of these after exit from loop....

  cout << "# fitting mixture with [ " << bigK << " ] components (this may take some time)" << endl;

  for (int start = 0; start < maxstart; ++start) {

    if (start == 0) {
      // for first start, choose parameters corresponding to the trivial mixture
      for (int k=0; k<bigK; ++k) {
	pi[k] = 1./bigK;
	mu[k] = 0.;
	sigma[k] = sigma1;
      }
    } else {
      // for subsequent restarts, choose parameters at random
      for (int k=0; k<bigK; ++k) {
	pi[k] = 1./bigK;
	if (muZero) {
	  mu[k] = 0.;
	} else {
	  mu[k] = rnorm(0.,sigma1);
	}
	sigma[k] = sigma1 / rgamma(2.,2.);
      }
      // Note, the MatLAB function uses random starting points
      // pi[k] = 1./bigK
      // mu[k] = y[rand_unif(n)] // i.e. a random observation
      // sigma[k] = sigma1
    }

    if (do_fit(y, n, &pi, &mu, &sigma, bigK, muZero, &thisDegen, &muBad, &thisConv, &thisIt, maxit)) {

      lnlik = 0.;
      for (int i = 0; i < n; ++i) {
	double f = 0.;
	for (int k = 0; k < bigK; ++k) {
	  f += pi[k] * dnorm(y[i], mu[k], sigma[k], 0);
	}
	lnlik += log(f);
      }
      if (start == 0 || lnlik>lnlikBest) {
	lnlikBest = lnlik;
	for (int k = 0; k < bigK; ++k) {
	  (*piBest)[k] = pi[k];
	  (*muBest)[k] = mu[k];
	  (*sigmaBest)[k] = sigma[k];
	}
	itBest = thisIt;
	startBest = start;
	convOnce = true;
      }
    } else {
      if (thisDegen) {
	convDegen = true;
	muDegen.insert(muBad);
      }
    }

    if (io->progress) {
      if (convOnce) {
	cout << "# after " << start+1 << " EM runs best lnLik = " << lnlikBest << "       " << "\r";
// 	for (int k = 0; k < bigK; ++k) {
// 	  cout << "#   pi" << k+1 << " = " << (*piBest)[k];
// 	  cout << " mu" << k+1 << " = " << (*muBest)[k];
// 	  cout << " sigma" << k+1 << " = " << (*sigmaBest)[k] << endl;
// 	}
      } else {
	cout << "# after " << start+1 << " EM runs no convergence !" << "       " << "\r";
      }
      fflush(stdout); 	
    }
  }
      
  if (convOnce) {
    if (verbose) {
      cout << "# fitted mixture with [ " << bigK << " ] components               " << endl;
      cout << "#   lnLik = " << lnlikBest << " ( versus " << lnlikNull << " for normal model )" << endl;
      cout << "#   BIC = " << 2.*(lnlikNull-lnlikBest)+log(n)*3.*(double)(bigK-1) << " versus normal model (lower is better)" << endl;
      cout << "#   found on run " << startBest+1 << "/" << maxstart << " after " << itBest << "/" << maxit << " EM iterations" << endl;
      for (int k = 0; k < bigK; ++k) {
	cout << "#   pi" << k+1 << " = " << (*piBest)[k];
	cout << " mu" << k+1 << " = " << (*muBest)[k];
	cout << " sigma" << k+1 << " = " << (*sigmaBest)[k] << endl;
      }
    }
    return(1);
  } else {
    if (verbose) {
      cout << "# failed to fit mixture with [ " << bigK << " ] components" << endl;
      cout << "#   after " << maxstart << " attempts using " << maxit << " iterations of EM algorithm" << endl;
      if (convDegen) {
	cout << "!" << endl;
	cout << "! warning : some attempts converged to degenerate solutions with sigma = 0" << endl;
	cout << "!         : mu =";
	for (set<double>::iterator idx = muDegen.begin(); idx != muDegen.end(); ++idx) {
	  cout << " " << *idx;
	}
	cout << endl;
	cout << "!         : note these values are calculated after centering the phenotype" << endl;
      }
    }
    return(0);
  }
}





int do_fit (vector<double> y, int n, 
	    vector<double>* pi, vector<double>* mu, vector<double>* sigma, int bigK,
	    bool muZero, bool* thisDegen, double* muBad, bool* thisConv, int* thisIt, int maxit) {

  const double tol = 1.e-8;
  const double lltol = 1.e-4;
  
  *thisDegen = false;
  *thisConv = true;
  
  vector<double> sigma2 (bigK);
  for (int k = 0; k < bigK; ++k) {
    sigma2[k] = pow2((*sigma)[k]);
  }

  vector<double> en (bigK);
  vector<double> esumy (bigK);
  vector<double> esumyy (bigK);
  vector<double> p (bigK);

  double lnlik;
  double oldlnlik = 0.;

  for (int i = 0; i < n; ++i) {
    double f = 0.;
    for (int k=0; k<bigK; ++k) {
      f += (*pi)[k] * dnorm(y[i], (*mu)[k], (*sigma)[k], 0);
    }
    oldlnlik += log(f);
  }

  for (int it=0; it<maxit; ++it) {

    for (int k = 0; k < bigK; ++k) {
      en[k] = 0.;
      esumy[k] = 0.;
      esumyy[k] = 0.;
    }
    
    for (int i = 0; i < n; ++i) {
      double lnsump = log(0.);
      for (int k = 0; k < bigK; ++k) {
	p[k] = log((*pi)[k]) + dnorm(y[i], (*mu)[k], (*sigma)[k], 1);
	lnsump = logspace_add(lnsump, p[k]);
      }

      for (int k = 0; k < bigK; ++k) {
	double thisp = exp(p[k] - lnsump);
	en[k] += thisp;
	esumy[k] += thisp * y[i];
	esumyy[k] += thisp * pow2(y[i]);
      }
    }

    double ssdiff = 0.;
    for (int k = 0; k < bigK; ++k) {
      (*pi)[k] = en[k]/(double)n;
      double oldsigma = (*sigma)[k];
      if (muZero) {
	sigma2[k] = esumyy[k]/en[k];
      } else {
	(*mu)[k] = esumy[k]/en[k];
	sigma2[k] = esumyy[k]/en[k] - pow2((*mu)[k]);
      }
      (*sigma)[k] = sqrt(sigma2[k]);
      if ((*sigma)[k] == 0.) {
	*thisDegen = true;
	*muBad = (*mu)[k];
	return(0);
      }
      ssdiff += (*pi)[k]*pow2((*sigma)[k]-oldsigma);
    }
    if (ssdiff < tol) {
      *thisConv = true;
      *thisIt = it+1;
      return(1);
    }

    if (it > 0 && it%100 == 0) {

      lnlik = 0.;
      for (int i = 0; i < n; ++i) {
	double f = 0.;
	for (int k=0; k<bigK; ++k) {
	  f += (*pi)[k] * dnorm(y[i], (*mu)[k], (*sigma)[k], 0);
	}
	lnlik += log(f);
      }
      if ((lnlik - oldlnlik)/100 < lltol) {
	*thisConv = true;
	*thisIt = it+1;
	return(1);
      }
      oldlnlik = lnlik;
    }
  }

  return(0);
}













int method_em_mix (vector<double> y, vector<double> pAA, vector<double> pAB, vector<double> pBB, 
		int n, int bigK, vector<double> pi, vector<double> mu, vector<double> sigma,
		double alphaNull,
		int* nIts, double* betaMLE, double* lnlik, double* pLRT) {

  const int maxit = 500;
  const double tol = 1.e-6; // tolerance for change in alpha and beta
  const double lltol = 1.e-4; // tolerance for per-iteration increase in lnLik

  vector<double> pStarAA (bigK);
  vector<double> pStarAB (bigK);
  vector<double> pStarBB (bigK);

  vector<double> weight (bigK);
  for (int k = 0; k < bigK; ++k) {
    weight[k] = pow2(sigma[0]/sigma[k]);
  }
  
  double pStarSum, pStarAnyW, ymmu;
  double eg,eg2;
  
  double alpha = alphaNull;
  double oldalpha = alpha;
  double beta = 0.;
  double oldbeta = 0.;
  double oldlnlik = 0.;

  double sumw;
  double sumg;
  double sumy;
  double sumyg;
  double sumg2;
  double sumyy;

  double f;

  int it; // need to know value after exit from loop....
  for (it=0; it<maxit; ++it) {

    sumw = 0.;
    sumg = 0.;
    sumy = 0.;
    sumyg = 0.;
    sumg2 = 0.;
    sumyy = 0.;

    for (int i = 0; i < n; ++i) {
      
      pStarSum = 0.;
      for (int k = 0; k < bigK; ++k) {
	pStarAA[k] = pAA[i] * pi[k] * dnorm(y[i] - alpha, mu[k], sigma[k], 0);
	pStarAB[k] = pAB[i] * pi[k] * dnorm(y[i] - alpha - beta, mu[k], sigma[k], 0);
	pStarBB[k] = pBB[i] * pi[k] * dnorm(y[i] - alpha - 2.*beta, mu[k], sigma[k], 0);
	pStarSum += pStarAA[k] + pStarAB[k] + pStarBB[k];
      }
      
      for (int k = 0; k < bigK; ++k) {
	pStarAA[k] /= pStarSum;
	pStarAB[k] /= pStarSum;
	pStarBB[k] /= pStarSum;
	pStarAnyW = (pStarAA[k] + pStarAB[k] + pStarBB[k]) * weight[k];
	
	eg = pStarAB[k] + 2.*pStarBB[k];
	eg2 = pStarAB[k] + 4.*pStarBB[k];
	ymmu = y[i]-mu[k];
	
	sumw += pStarAnyW;
	sumg += eg * weight[k];
	sumy += ymmu * pStarAnyW;
	sumyg += ymmu * eg * weight[k];
	sumg2 += eg2 * weight[k];
	sumyy += pow2(ymmu) * pStarAnyW;
      }
    }

    alpha = (sumy*sumg2/sumw - sumyg*sumg/sumw) / (sumg2 - pow2(sumg)/sumw);
    beta = (sumyg - sumy*sumg/sumw) / (sumg2 - pow2(sumg)/sumw);
    
    //double re = pow2(alpha)*sumw + 2*alpha*beta*sumg + pow2(beta)*sumg2;
    //Grandsigma = sqrt((sumyy - re)/(double)n);
    
    if (it > 0) {
      // fix in 4.1 make fabs()+fabs()
      if (fabs(log(beta/oldbeta)+log(alpha/oldalpha)) < tol) {
	break;
      }
      oldalpha = alpha;
      oldbeta = beta;

      if (it%100 == 0) {
	*lnlik = 0.;
	for (int i=0; i<n; ++i) {
	  f = 0.;
	  for (int k = 0; k < bigK; ++k) {
	    f += pAA[i] * pi[k] * dnorm(y[i] - alpha, mu[k], sigma[k], 0);
	    f += pAB[i] * pi[k] * dnorm(y[i] - alpha - beta, mu[k], sigma[k], 0);
	    f += pBB[i] * pi[k] * dnorm(y[i] - alpha - 2.*beta, mu[k], sigma[k], 0);
	  }
	  *lnlik += log(f);
	  f = 0.;
	  for (int k = 0; k < bigK; ++k) {
	    f += pi[k] * dnorm(y[i] - alphaNull, mu[k], sigma[k], 0);
	  }
	  *lnlik -= log(f);
	}
	if (*lnlik - oldlnlik < -1e-5) {
	  // need to think a bit about what is an okay tolerance
	  // and maybe print the value
	  cerr << "!" << endl;
	  cerr << "! lnLik decreased by " << oldlnlik - *lnlik << " during EM iteration. THIS IS BAD." << endl;
	  cerr << "!" << endl;
	  return(0);
	}	
	if ((*lnlik - oldlnlik)/100. < lltol) {
	  break;
	}
	oldlnlik = *lnlik;
      }
    }
  } // end for loop so ! it<maxit

  if (it >= maxit) {
    return(0);
  }

  *lnlik = 0.;
  for (int i=0; i<n; ++i) {
    f = 0.;
    for (int k = 0; k < bigK; ++k) {
      f += pAA[i] * pi[k] * dnorm(y[i] - alpha, mu[k], sigma[k], 0);
      f += pAB[i] * pi[k] * dnorm(y[i] - alpha - beta, mu[k], sigma[k], 0);
      f += pBB[i] * pi[k] * dnorm(y[i] - alpha - 2.*beta, mu[k], sigma[k], 0);
    }
    *lnlik += log(f);
    f = 0.;
    for (int k = 0; k < bigK; ++k) {
      f += pi[k] * dnorm(y[i] - alphaNull, mu[k], sigma[k], 0);
    }
    *lnlik -= log(f);
  }
  
  if (it < maxit) {
    *nIts = it;
    *betaMLE = beta;
    *pLRT = pchisq(2.*(*lnlik), 1, 0, 0);    
    return(1);
  }
  return(0);
}
		

int method_mean_mix (vector<double> y, vector<double> g,
		int n, int bigK, vector<double> pi, vector<double> mu, vector<double> sigma,
		double alphaNull,
		int* nIts, double* betaMLE, double* se, double* pLRT) {

  const int maxit = 500;
  const double tol = 1.e-6; // tolerance for change in alpha and beta
  const double lltol = 1.e-4; // tolerance for per-iteration increase in lnLik

  vector<double> pStar (bigK);

  vector<double> weight (bigK);
  for (int k = 0; k < bigK; ++k) {
    weight[k] = pow2(sigma[0]/sigma[k]);
  }
  
  double pStarSum, pStarW, ymmu;
  vector<double> g2 (n);
  for (int i = 0; i < n; ++i) {
    g2[i] = pow2(g[i]);
  }
  
  double alpha = alphaNull;
  double oldalpha = alpha;
  double beta = 0.;
  double oldbeta = 0.;
  double lnlik = 0.;
  double oldlnlik = 0.;

  double sumw;
  double sumg;
  double sumy;
  double sumyg;
  double sumg2;
  double sumyy;

  double f;

  int it; // need to know value after exit from loop....
  for (it=0; it<maxit; ++it) {

    sumw = 0.;
    sumg = 0.;
    sumy = 0.;
    sumyg = 0.;
    sumg2 = 0.;
    sumyy = 0.;

    for (int i = 0; i < n; ++i) {
      
      pStarSum = 0.;
      for (int k = 0; k < bigK; ++k) {
	pStar[k] = pi[k] * dnorm(y[i] - alpha - g[i]*beta, mu[k], sigma[k], 0);
	pStarSum += pStar[k];
      }
      
      for (int k = 0; k < bigK; ++k) {
	pStar[k] /= pStarSum;
	pStarW = pStar[k] * weight[k];
	
	ymmu = y[i]-mu[k];
	
	sumw += pStarW;
	sumg += g[i] * pStarW; //weight[k];
	sumy += ymmu * pStarW;
	sumyg += ymmu * g[i] * pStarW; // weight[k];
	sumg2 += g2[i] * pStarW; // weight[k];
	sumyy += pow2(ymmu) * pStarW;
      }
    }

    alpha = (sumy*sumg2/sumw - sumyg*sumg/sumw) / (sumg2 - pow2(sumg)/sumw);
    beta = (sumyg - sumy*sumg/sumw) / (sumg2 - pow2(sumg)/sumw);
    
    //double re = pow2(alpha)*sumw + 2*alpha*beta*sumg + pow2(beta)*sumg2;
    //Grandsigma = sqrt((sumyy - re)/(double)n);
    
    if (it > 0) {
      if (fabs(log(beta/oldbeta))+fabs(log(alpha/oldalpha)) < tol) {
	break;
      }
      oldalpha = alpha;
      oldbeta = beta;

      if (it%100 == 0) {
	lnlik = 0.;
	for (int i=0; i<n; ++i) {
	  f = 0.;
	  for (int k = 0; k < bigK; ++k) {
	    f += pi[k] * dnorm(y[i] - alpha - g[i]*beta, mu[k], sigma[k], 0);
	  }
	  lnlik += log(f);
	  f = 0.;
	  for (int k = 0; k < bigK; ++k) {
	    f += pi[k] * dnorm(y[i] - alphaNull, mu[k], sigma[k], 0);
	  }
	  lnlik -= log(f);
	}
	if (lnlik - oldlnlik < -1e-5) {
	  // need to think a bit about what is an okay tolerance
	  // and maybe print the value
	  cerr << "!" << endl;
	  cerr << "! lnLik decreased by " << oldlnlik - lnlik << " during EM iteration. THIS IS BAD." << endl;
	  cerr << "!" << endl;
	  return(0);
	}	
	if ((lnlik - oldlnlik)/100. < lltol) {
	  break;
	}
	oldlnlik = lnlik;
      }
    }
  } // end for loop so ! it<maxit

  if (it >= maxit) {
    return(0);
  }

  lnlik = 0.;
  for (int i=0; i<n; ++i) {
    f = 0.;
    for (int k = 0; k < bigK; ++k) {
      f += pi[k] * dnorm(y[i] - alpha - g[i]*beta, mu[k], sigma[k], 0);
    }
    lnlik += log(f);
    f = 0.;
    for (int k = 0; k < bigK; ++k) {
      f += pi[k] * dnorm(y[i] - alphaNull, mu[k], sigma[k], 0);
    }
    lnlik -= log(f);
  }
  
  if (it < maxit) {
    *nIts = it;
    *betaMLE = beta;
    *se = beta/sqrt(2.*lnlik);
    *pLRT = pchisq(2.*lnlik, 1, 0, 0);    
    return(1);
  }
  return(0);
}
		


int method_mix_null (vector<double> y, int n, 
		     int bigK, vector<double> piInit, vector<double> mu, vector<double> sigma,
		     int* nIts, double* alphaMLE, double* lnlik,
		     bool verbose) {

  const int maxit = 500;
  const double tol = 1.e-6;

  vector<double> pi (bigK);
  vector<double> weight (bigK);
  for (int k = 0; k < bigK; ++k) {
    pi[k] = piInit[k];
    weight[k] = pow2(sigma[0]/sigma[k]);
  }
  
  vector<double> pStar (bigK);
  double pStarSum;
  
  double alpha = 0.;
  double oldalpha = 0.;

  double sumw;
  double sumy;

  int it; // need to know value after exit from loop....
  for (it=0; it<maxit; ++it) {

    sumw = 0.;
    sumy = 0.;

    for (int i = 0; i < n; ++i) {
      
      pStarSum = 0.;
      for (int k = 0; k < bigK; ++k) {
	pStar[k] = pi[k] * dnorm(y[i] - alpha, mu[k], sigma[k], 0);
	pStarSum += pStar[k];
      }
      
      for (int k = 0; k < bigK; ++k) {
	pStar[k] /= pStarSum;
	
	sumw += pStar[k] * weight[k];
	sumy += (y[i]-mu[k]) * pStar[k] * weight[k];
      }
    }
    
    alpha = sumy/sumw;

    if (it>0) {
      if (fabs(log(alpha/oldalpha)) < tol) {
	break;
      }
      oldalpha = alpha;
    }
  }
  if (it >= maxit) {
    return(0);
  }

  *lnlik = 0.;
   double f;
  for (int i=0; i<n; ++i) {
    f = 0.;
    for (int k = 0; k < bigK; ++k) {
      f += pi[k] * dnorm(y[i] - alpha, mu[k], sigma[k], 0);
    }
    *lnlik += log(f);
    f = 0.;
    for (int k = 0; k < bigK; ++k) {
      f += piInit[k] * dnorm(y[i], mu[k], sigma[k], 0);
    }
    *lnlik -= log(f);
  }
  
  if (it < maxit) {
    *nIts = it;
    *alphaMLE = alpha;

    if (verbose) {
      cout << "# estimated null alpha = " << alpha;
      cout << " ( increase in lnLik = " << *lnlik << " vs. alpha = 0 )" << endl;
    }
    
    return(1);
  }
  return(0);
}
		
int method_Pettitt (vector<int> yRank, vector<double> g, int n,
		    double* c, double* a,
		    double* beta, double* se, double* pval) {
  
  double XtCX = 0.;
  double Xta = 0.;
  for (int i = 0; i < n; ++i) {
    Xta += g[i]*a[yRank[i]];
    XtCX += g[i]*(1.-c[yRank[i]*n+yRank[i]])*g[i];
    for (int j = i+1; j < n; ++j) {
      XtCX += 2.*g[i]*c[yRank[i]*n+yRank[j]]*g[j];
    }
  }
  if (XtCX > 0) {
    *beta = Xta/XtCX;
    *se = sqrt(((double)n/XtCX - pow2(*beta))/(double)(n-2));
    *pval = 2.*pt(-fabs(*beta)/(*se), (double)(n-2), 1, 0);
    return (1);
  }
  return (0);
}


int preprocess_phenotype (io_spec* io, double* yRaw, int n, 
			  double* vars, double* vars2, double* varty, int nCov, int nVar,
			  vector<double>* yResid, vector<double>* yNorm, vector<int>* yRank, 
			  double* sumyyOrig, double* sumyyResid, double* sumyyNorm) {

  vector<pheno> ySort (n); // used twice

  // paranoid check that n > 0
  if (n <= 0) {
    cerr << "!" << endl;
    cerr << "! fatal error: number of non-missing phenotypes is zero" << endl;
    cerr << "!" << endl;
    return (0);
  }

  cout << "# phenotype [ " << io->nPheno << " ] ";
  print_summary(yRaw, n);
  *sumyyOrig = 0.;
  for (int i = 0; i < n; ++i) {
    *sumyyOrig += pow2(yRaw[i]);
  } // don't care that we just computed this twice, the computer is fast

  // even when not quantile-normalising, 
  // sort the phenotype to check for tied values
  if (!io->ignoreTies) {

    // construct index-value pairs for sorting, and sort
    for (int i = 0; i < n; ++i) {
      ySort[i].idx = i;
      ySort[i].val = yRaw[i];
    }
    sort(ySort.begin(), ySort.end(), phenoLess());
    
    // here the variable `ties' is the number of tied values,
    //    thus ties==1 means a single (untied) value
    bool tieWarning = false; // whether to print message explaining warning
    for (int i = 0, ties; i < n; i += ties) {
      ties = 1;
      while (i+ties < n && ySort[i+ties].val == ySort[i].val) ++ties;
      if (ties > 1) {
	if (!tieWarning) {
	  cout << "!" << endl;
	}
	cout << "! warning : " << ties << " individuals have identical phenotype " << ySort[i].val << endl;
	tieWarning = true;
      }
    }
    if (tieWarning) {
      cout << "! warning : are you coding missing data using `special' values ?" << endl;
      cout << "!         : see documentation for --missing-code and --ignore-ties options" << endl;
      cout << "!" << endl;
    }
  }
  
  // summarise covariates and make residuals using normal linear model
  { int ii = 0;
    for (list<string>::iterator i = io->nCovar->begin(); i != io->nCovar->end(); ++i) {
      cout << "# covariate [ " << *i << " ] ";
      // die if (ii >= nCov)
      print_summary(&(vars[ii*n]), n);
      ++ii;
    }
  }
  
  double* tmpResid = new double [n];
  if (!normal_residuals (tmpResid, yRaw, n, vars, vars2, varty, nCov+1, nVar, sumyyResid)) {
    cerr << "!" << endl;
    cerr << "! error : could not compute residual phenotype" << endl;
    cerr << "!       : check for colinear covariates" << endl;
    cerr << "!" << endl;
    return (0);
  }

  cout << "# residual phenotype ";
  print_summary(tmpResid, n);
  
  // assign to yResid
  for (int i = 0; i < n; ++i) {
    (*yResid)[i] = tmpResid[i];
  }

  // construct index-value pairs for sorting, sort residuals this time
  for (int i = 0; i < n; ++i) {
    ySort[i].idx = i;
    ySort[i].val = (*yResid)[i];
  }
  sort(ySort.begin(), ySort.end(), phenoLess());

  // compute (approx) IQR and thus necessary scaling
  double normScale = (ySort[int(.75*n)].val - ySort[int(.25*n)].val) * 0.7413011;

  // here the variable `ties' is the number of tied values,
  //    thus ties==1 means a single (untied) value
  for (int i = 0, ties; i < n; i += ties) {
    ties = 1;
    while (i+ties < n && ySort[i+ties].val == ySort[i].val) ++ties;
    //    if (ties > 1) tieWarning = true;
    // compute ranks and thus quantile-normalised values, using mean ranks for ties
    for (int j = 0; j < ties; ++j) {
      (*yNorm)[ySort[i+j].idx] = qnorm(((double)(2*i+ties-1)*0.5 + 0.625)/((double)n + 0.25), 0, 1, 1, 0) * normScale;
    }
  }
  // compute sumyyNorm
  *sumyyNorm = 0.;
  for (int i = 0; i < n; ++i) *sumyyNorm += pow2((*yNorm)[i]);
  // put unstable ranks into this vector for Pettitt rank-based method
  for (int i = 0; i < n; ++i) (*yRank)[ySort[i].idx] = i+1;
 
  if (io->quantileNormalise) {
    cout << "# performed quantile-normalisation with scale " << normScale << endl;
    cout << "# quantile-normalised phenotype";
    print_summary(*yNorm, n);
  }

  //  if (tieWarning) {
  //  cout << "! warning : ranks method bad idea" << endl;
  //  cout << "!" << endl;
  //}

  return(1);
}


// streamlined code to qq norm a phenotype, no checks and no messages
int fast_qq_norm (vector<double>* yRaw, int n, vector<double>* yNorm, double* sumyyNorm) {

  // construct index-value pairs for sorting
  vector<pheno> ySort (n);
  for (int i = 0; i < n; ++i) {
    ySort[i].idx = i;
    ySort[i].val = (*yRaw)[i];
  }

  // sort by phenotype value; we will recover permutation by looking
  // at associated indices
  sort(ySort.begin(), ySort.end(), phenoLess());

  // compute (approx) IQR and thus necessary scaling
  double normScale = (ySort[int(.75*n)].val - ySort[int(.25*n)].val) * 0.7413011;

  // probably there would be a fast and accurate guess for this
  *sumyyNorm = 0;

  // here the variable `ties' is the number of tied values,
  //    thus ties==1 means a single (untied) value
  for (int i = 0, ties; i < n; i += ties) {
    ties = 1;
    while (i+ties < n && ySort[i+ties].val == ySort[i].val) ++ties;
    // compute ranks and thus quantile-normalised values, using mean ranks for ties
    for (int j = 0; j < ties; ++j) {
      (*yNorm)[ySort[i+j].idx] = qnorm(((double)(2*i+ties-1)*0.5 + 0.625)/((double)n + 0.25), 0, 1, 1, 0) * normScale;
      *sumyyNorm += pow2((*yNorm)[ySort[i+j].idx]);
    }
  }

  return(1);
}

double compute_alphaHat (vector<double> p0, vector<double> p1, vector<double> p2,
			 double meanAB, double meanBB, int n) {

  // method of moments estimator, adapted from Narayanan AS266

  double sump0sq = 0.;
  double sump1sq = 0.;
  double sump2sq = 0.;
  
  for (int i = 0; i<n; ++i) {
    sump0sq += pow2(p0[i]);
    sump1sq += pow2(p1[i]);
    sump2sq += pow2(p2[i]);
  }

  double meanAA = (double)n-meanAB-meanBB;

  double alpha = ((meanAA)*((meanAA) - sump0sq)/(sump0sq - pow2(meanAA)/(double)n) +
		  (meanAB)*((meanAB) - sump1sq)/(sump1sq - pow2(meanAB)/(double)n) +
		  (meanBB)*((meanBB) - sump2sq)/(sump2sq - pow2(meanBB)/(double)n))/(double)n;
  return (alpha);
}






int method_interaction (vector<double> y, vector<double> g, 
			double* vars, double* vars2, double* varty, 
			int nn, int nCov, double sumyy, 
			double* betaInt, double* seInt, double* pvalInt) {

  int nVar = nCov+3; // +intercept+G+GE
  int nVar2 = nVar*nVar;
  // make X'X and X'y, assuming g, e and y are all centered

  double df = (double)(nn) - nVar;
  if (df < 1) return(0);

  double* xtx = new double [nVar2];
  for (int ii = 0; ii < nVar2; ++ii) xtx[ii] = vars2[ii];
  
  double* xty = new double [nVar];
  for (int ii = 0; ii < nVar; ++ii) xty[ii] = varty[ii];

  //  0       a ..... a
  //  .       .       .
  //  ncov    a ..... a
  //          1 ... 1 2 3
  //          4 ... 4 5 6 7

  for (int i = 0; i < nn; ++i) {
    double ge = g[i]*vars[i]; // E is first column of vars
    for (int ii = 0; ii < nCov; ++ii) {
      xtx[ii*nVar + nCov+1] += g[i] * vars[ii*nn + i]; // [1]
      xtx[ii*nVar + nCov+2] += ge * vars[ii*nn + i]; // [4]
    }
    xtx[nCov*nVar + nCov+1] += g[i]; // [2]
    xtx[(nCov+1)*nVar + nCov+1] += pow2(g[i]); // [3]
    xtx[nCov*nVar + nCov+2] += ge; // [5]
    xtx[(nCov+1)*nVar + nCov+2] += g[i] * ge; // [6]
    xtx[(nCov+2)*nVar + nCov+2] += pow2(ge); // [7]
    xty[nCov+1] += y[i]*g[i]; // (2)-th
    xty[nCov+2] += y[i]*ge; // (4)-th
  }

  //for (int ii = 0; ii < nVar; ++ii) {
  //  for (int jj = 0; jj < nVar; ++jj) {
  //    cout << "  " << xtx[ii*nVar + jj];
  //  }
  //  cout << "\t\t= " << xty[ii] << endl;
  //}


  
  { // deliberately shadowing n and y, probably a very bad idea

    // arguments for dposv and dsymv (some are shared)
    char uplo = 'L';
    int blasInt2 = 2;
    double* a = new double [nVar2]; 
    int lda = nVar;
    double* b = new double [nVar*2];
    int ldb = nVar;
    int info = 0;
    double alpha = 1.;
    int blasInt1 = 1;
    double beta = 0.;
    double* y = new double [nVar];
    
    for (int ii = 0; ii < nVar2; ++ii) a[ii] = xtx[ii]; // copy because destroyed on call
    // use two rhs at same time
    for (int ii = 0; ii < nVar; ++ii) b[ii] = xty[ii]; // copy because destroyed on call
    for (int ii = 0; ii < nVar; ++ii) b[nVar+ii] = 0.;
    b[nVar+nCov+2] = 1.; // elementary vector for GE coefficient
    dposv_(&uplo, &nVar, &blasInt2, a, &nVar, b, &nVar, &info);
    if (info) return(0);
    double* theta = new double [nVar]; // regression parameter
    for (int ii = 0; ii < nVar; ++ii) theta[ii] = b[ii];
    double v = b[nVar+nCov+2]; 

    for (int ii = 0; ii < nVar2; ++ii) a[ii] = xtx[ii];
    for (int ii = 0; ii < nVar; ++ii) y[ii] = 0.; // is this needed?
    // y is a stupid name for t(X) %*% beta
    dsymv_(&uplo, &nVar, &alpha, a, &nVar, b, &blasInt1, &beta, y, &blasInt1);
    
    double ss = sumyy; // sum squared residuals
    for (int i = 0; i < nVar; ++i) ss -= y[i]*theta[i];
    
    if (info) return(0);
    
    *betaInt = theta[nCov+2];
    *seInt = sqrt(v * ss/df);
    *pvalInt = 2.*pt(-fabs(*betaInt)/(*seInt), df, 1, 0);

    delete [] theta;
    delete [] y;
    delete [] b;
    delete [] a;
  }

  delete [] xty;
  delete [] xtx;
  
  return(1);
}

int method_robust (vector<double> y, vector<double> g, 
			double* vars, double* vars2, double* varty, 
			int nn, int nCov, double sumyy, 
			double* betaInt, double* seInt, double* pvalInt,double* betasnp, double* sesnp, double* pvalsnp, double* covsnpinteraction) {

  int nVar = nCov+3; // +intercept+G+GE
  int nVar2 = nVar*nVar;
  // make X'X and X'y, assuming g, e and y are all centered

  double df = (double)(nn) - nVar;
  if (df < 1) return(0);

  double* xtx = new double [nVar2];
  for (int ii = 0; ii < nVar2; ++ii) xtx[ii] = vars2[ii];
  
  double* xty = new double [nVar];
  for (int ii = 0; ii < nVar; ++ii) xty[ii] = varty[ii];
  
  double* xx = new double [nVar*nn];
  for (int ii = 0; ii < nVar*nn; ++ii) xx[ii] = vars[ii];
  
  double* yy = new double [nn];
  for (int ii = 0; ii < nn; ++ii) yy[ii] = y[ii];

  //  0       a ..... a
  //  .       .       .
  //  ncov    a ..... a
  //          1 ... 1 2 3
  //          4 ... 4 5 6 7

  //  cout << nVar << endl;
  //  cout << nCov << " " << nCov+3 << endl;
  //  cout << nn << endl;

  for (int i = 0; i < nn; ++i) {
    double ge = g[i]*vars[i]; // E is first column of vars
    for (int ii = 0; ii < nCov; ++ii) {
      xtx[ii*nVar + nCov+1] += g[i] * vars[ii*nn + i]; // [1]
      xtx[ii*nVar + nCov+2] += ge * vars[ii*nn + i]; // [4]
    }
    xtx[nCov*nVar + nCov+1] += g[i]; // [2]
    xtx[(nCov+1)*nVar + nCov+1] += pow2(g[i]); // [3]
    xtx[nCov*nVar + nCov+2] += ge; // [5]
    xtx[(nCov+1)*nVar + nCov+2] += g[i] * ge; // [6]
    xtx[(nCov+2)*nVar + nCov+2] += pow2(ge); // [7]
    xty[nCov+1] += y[i]*g[i]; // (2)-th
    xty[nCov+2] += y[i]*ge; // (4)-th
    xx[nCov*nn+i] = 1.;
    xx[(nCov+1)*nn+i] = g[i];
    xx[(nCov+2)*nn+i] = ge;
  }

  //  for (int ii = 0; ii < nVar; ++ii) {
  //  for (int jj = 0; jj < nVar; ++jj) {
  //    cout << "  " << xtx[ii*nVar + jj];
  //  }
  //  cout << "\t\t= " << xty[ii] << endl;
  //}


  
  { // deliberately shadowing n and y, probably a very bad idea

    // arguments for dposv and dsymv (some are shared)
    char uplo = 'L';
    int blasInt = 3;
    double* a = new double [nVar2]; 
    int lda = nVar;
    double* b = new double [nVar*3];
    int ldb = nVar;
    int info = 0;
    double alpha = 1.;
    double beta = 0.;
    double* y = new double [nVar];
    
    for (int ii = 0; ii < nVar2; ++ii) a[ii] = xtx[ii]; // copy because destroyed on call
    // use two rhs at same time
    for (int ii = 0; ii < nVar; ++ii) b[ii] = xty[ii]; // copy because destroyed on call
    for (int ii = 0; ii < nVar; ++ii) b[nVar+ii] = 0.;
    for (int ii = 0; ii < nVar; ++ii) b[2*nVar+ii] = 0.;
    b[nVar+nCov+2] = 1.; // elementary vector for GE coefficient
    b[2*nVar+nCov+1] = 1.; // elementary vector for G coefficient

    //cout << "BEFORE CALL b: theta?\n";
    //for (int jj = 0; jj < nVar*3; ++jj) {
      //cout << " " << b[jj];
    //}
    //cout << endl;
    dposv_(&uplo, &nVar, &blasInt, a, &nVar, b, &nVar, &info);
    if (info) return(0);

    //cout << "AFTER CALL b: theta?\n";
      //for (int jj = 0; jj < nVar*3; ++jj) {
      //cout << " " << b[jj];
    //}
    //cout << endl;

    double* theta = new double [nVar]; // regression parameter estimates
    for (int ii = 0; ii < nVar; ++ii) theta[ii] = b[ii]; 

    double* yhat = new double [nn];
    for (int i=0; i<nn; ++i) {
      yhat[i] = 0;
      for (int ii=0; ii<nVar; ++ii) yhat[i]+=xx[ii*nn+i]*b[ii];
    }
    double* d = new double [nn];
    for (int i=0; i<nn; ++i) d[i]=pow2(yy[i]-yhat[i]);
    double* xtdx = new double [nVar2];
    for (int i=0; i<nVar; ++i) {
      for (int ii=0; ii<nVar; ++ii) {
	xtdx[ii*nVar+i] = 0;
	for (int iii=0; iii<nn; ++iii) xtdx[ii*nVar+i]+=xx[i*nn+iii]*xx[ii*nn+iii]*d[iii];
      }
    }
    double* xtdxb = new double [nVar*3];
    for (int i=0; i<nVar; ++i) {
      for (int ii=0; ii<3; ++ii) {
	xtdxb[ii*nVar+i] = 0;
	for (int iii=0; iii<nVar; ++iii) xtdxb[ii*nVar+i]+=xtdx[iii*nVar+i]*b[ii*nVar+iii];
      }
    }
    for (int ii = 0; ii < nVar2; ++ii) a[ii] = xtx[ii];//a destroyed after first call
    dposv_(&uplo, &nVar, &blasInt, a, &nVar, xtdxb, &nVar, &info);
    if (info) return(0);
    
    double v = xtdxb[nVar+nCov+2];  // variance of SNP*E estimate
    double covvw = xtdxb[nVar+nCov+1]; // covariance of SNP estimate and SNP*E estimate
    double w = xtdxb[2*nVar+nCov+1]; // variance of SNP estimate
    *betasnp =theta[nCov+1] ;
    *sesnp = sqrt(w);;
    *pvalsnp = 2.*pt(-fabs(*betasnp)/(*sesnp), df, 1, 0);
    *covsnpinteraction = covvw;
    *betaInt = theta[nCov+2];
    *seInt = sqrt(v);
    *pvalInt = 2.*pt(-fabs(*betaInt)/(*seInt), df, 1, 0);

    delete [] xtdxb;
    delete [] xtdx;
    delete [] d;
    delete [] yhat;
    delete [] theta;
    delete [] y;
    delete [] b;
    delete [] a;
  }

  delete [] yy;
  delete [] xx;
  delete [] xty;
  delete [] xtx;    

  return(1);
}



int method_logistic (vector<int> y, vector<double> g, int n,
		     double* x, int nCov, vector<double>* qnull, bool null,
		     double* beta, double* se, double* pval) {

  char blasTransNo = 'N';
  char blasUploLow = 'L';
  double blasAlpha1 = 1.;
  int blasInt1 = 1;
  double blasBeta0 = 0.;

  int maxNits = 20;

  int nVar = null ? 1+nCov : 2+nCov;

  double* q = new double [nVar];
  // note q is parameter estimate, *not* ==1-p
  if (null) {
    // the initial guess could be more clever...
    for (int ii = 0; ii < nVar; ++ii) {
      q[ii] = 0.;
    }
  } else {
    for (int ii = 0; ii < nVar-1; ++ii) {
      q[ii] = (*qnull)[ii];
    }
    q[nVar-1] = 0.;

    // overwrite the (nCov+2)-th column of x
    for (int i = 0; i < n; ++i) {
      x[(nVar-1)*n + i] = g[i];
    }
  }

  double* nu = new double [n];
  double* p = new double [n];
  double* a = new double [nVar*nVar];
  double* b = new double [nVar];

  double delta = 0.;
  int info;
  int nits = 0;

  do {

    // nu = x %*% q
    dgemv_(&blasTransNo, &n, &nVar, &blasAlpha1, x, &n, q, &blasInt1, &blasBeta0, nu, &blasInt1);
    
    // p  = 1/(1+exp(-nu))
    for (int i = 0; i < n; ++i) {
      p[i] = 1./(1.+exp(-nu[i]));
    }
    
    // a = t(x) %*% diag(p(1-p)) %*% x    // x'x is dsyrk_
    // b = t(x) %*% (y-p)
    for (int ii = 0; ii < nVar; ++ii) {
      for (int jj = 0; jj < nVar; ++jj) {
	a[ii*nVar + jj] = 0.;
      }
      b[ii] = 0.;
    }
    for (int i = 0; i < n; ++i) {
      for (int ii = 0; ii < nVar; ++ii) {
	for (int jj = ii; jj < nVar; ++jj) {
	  a[ii*nVar + jj] += x[n*ii + i] * p[i] * (1.-p[i]) * x[n*jj + i];
	}
	b[ii] += x[n*ii + i] * (y[i] - p[i]);
      }
    } // only need to fill F77-sense lower triangle of this matrix


//     for (int ii = 0; ii < nVar; ++ii) {
//       for (int jj = 0; jj < nVar; ++jj) {
// 	cout << " " << a[ii*nVar + jj];
//       }
//       cout << "       = " << b[ii] << endl;
//     }
//     cout << endl;

    // q = q + solve(a, b)
    dposv_(&blasUploLow, &nVar, &blasInt1, a, &nVar, b, &nVar, &info);
    if (info)  {
      delete [] b;
      delete [] a;
      delete [] p;
      delete [] nu;
      delete [] q;
      return (0);
    }
    delta = 0.;
    for (int ii = 0; ii < nVar; ++ii) {
      q[ii] += b[ii];
      delta += pow2(b[ii]);
    }
    ++nits;

  } while (nits < maxNits && delta > 1e-10);
  // } (theta not changed too much);
  if (nits == maxNits) {
    delete [] b;
    delete [] a;
    delete [] p;
    delete [] nu;
    delete [] q;
    return (0);
  }

  if (null) {
    for (int ii = 0; ii < nVar; ++ii) {
      (*qnull)[ii] = q[ii];
    }
    return (nits);
  }

  *beta = q[nVar-1];

  // std.errors = sqrt(diag(solve(a)))
  // rebuild a because destroyed by last call to dposv
  // build new b
  for (int ii = 0; ii < nVar; ++ii) {
    for (int jj = 0; jj < nVar; ++jj) {
      a[ii*nVar + jj] = 0.;
    }
    b[ii] = 0.;
  }
  for (int i = 0; i < n; ++i) {
    for (int ii = 0; ii < nVar; ++ii) {
      for (int jj = 0; jj < nVar; ++jj) {
	a[ii*nVar + jj] += x[n*ii + i] * p[i] * (1.-p[i]) * x[n*jj + i];
      }
    }
  } // should only fill F77-sense lower triangle of this matrix
  b[nVar-1] = 1.;
  dposv_(&blasUploLow, &nVar, &blasInt1, a, &nVar, b, &nVar, &info);
  if (info) {
    delete [] b;
    delete [] a;
    delete [] p;
    delete [] nu;
    delete [] q;
    return (0);
  }
  *se = sqrt(b[nVar-1]);
  *pval = pchisq(pow2(*beta)/b[nVar-1], 1, 0, 0); // lower_tail = 0, give_log = 0;

  delete [] b;
  delete [] a;
  delete [] p;
  delete [] nu;
  delete [] q;

  return(nits);
}
  
    

// define x as a n*(ncov+3) array
// first n columns are covariates
//   n+1th column is all ones
//   n+2th column is genotype
//   n+3th column is genotype*E, if needed

// define a as (ncov+3)*(ncov+3) array
//   populate first (ncov+1)*(ncov+1) elements as covariate covariance matrix






int normal_residuals (double* yOut, double* yIn, int n, 
 		      double* vars, double* vars2, double* varty, int nUse, int nVar, double* ssr) {

  char blasUploLow = 'L';
  char blasTransNo = 'N';
  int blasInt1 = 1;
  double blasAlphaNegOne = -1.;
  double blasBetaOne = 1.;

  // we make nVar*nVar covariance matrix vars2,
  // but then only copy the nUse*nUse top left block into a
  // note must make copy because destroyed by call to dposv_

  double* a = new double [nUse*nUse];
  double* b = new double [nUse];
  
  for (int ii = 0; ii < nVar; ++ii) {
    for (int jj = 0; jj < nVar; ++jj) { // initialise nVar*nVar
      vars2[ii*nVar + jj] = 0.;
    }
    varty[ii] = 0.;
  }
  for (int i = 0; i < n; ++i) { // fill in nUse*nUse
    for (int ii = 0; ii < nUse; ++ii) {
      for (int jj = 0; jj < nUse; ++jj) {
	vars2[ii*nVar + jj] += vars[n*ii + i] * vars[n*jj + i];
      }
      varty[ii] += vars[n*ii + i] * yIn[i];
    }
    yOut[i] = yIn[i];
  } // should only fill F77-sense lower triangle of this matrix
  for (int ii = 0; ii < nUse; ++ii) {
    for (int jj = 0; jj < nUse; ++jj) {
      a[ii*nUse + jj] = vars2[ii*nVar + jj];
    }
    b[ii] = varty[ii];
  }

  int info = 0;
  dposv_(&blasUploLow, &nUse, &blasInt1, a, &nUse, b, &nUse, &info);
  if (info) {
    delete [] b;
    delete [] a;
    return (0);
  }

  dgemv_(&blasTransNo, &n, &nUse, &blasAlphaNegOne, vars, &n, b, &blasInt1, &blasBetaOne, yOut, &blasInt1);
  *ssr = 0.;
  for (int i = 0; i < n; ++i) {
    *ssr += pow2(yOut[i]);
  }

  delete [] b;
  delete [] a;
  return (1);
}






void print_summary (double* x, int n) {

  double minx = 0.;
  double maxx = 0.;
  double sumx = 0.;
  double sumxx = 0.;

  for (int i = 0; i < n; ++i) {
    sumx += x[i];
    sumxx += pow2(x[i]);
    if (i == 0) {
      minx = x[0];
      maxx = x[0];
    } else {
      minx = x[i] < minx ? x[i] : minx;
      maxx = x[i] > maxx ? x[i] : maxx;
    }
  }

  double meanx = sumx/n;
  cout << "has mean " << meanx;
  cout << " s.d. " << sqrt((sumxx - pow2(sumx)/n)/n);
  cout << " and range " << minx << " to " << maxx << endl;
}

void print_summary (vector<double> x, int n) {

  double minx = 0.;
  double maxx = 0.;
  double sumx = 0.;
  double sumxx = 0.;

  for (int i = 0; i < n; ++i) {
    sumx += x[i];
    sumxx += pow2(x[i]);
    if (i == 0) {
      minx = x[0];
      maxx = x[0];
    } else {
      minx = x[i] < minx ? x[i] : minx;
      maxx = x[i] > maxx ? x[i] : maxx;
    }
  }

  double meanx = sumx/n;
  cout << "has mean " << meanx;
  cout << " s.d. " << sqrt((sumxx - pow2(sumx)/n)/n);
  cout << " and range " << minx << " to " << maxx << endl;
} 
