import numpy as np from read_gene_expression import X,GID,STP,SID,UC,CD ## Convert X to log scale XL = np.log(X) / np.log(2) ## Z-test statistics MUC = XL[:,UC].mean(1) ## Mean of ulcerative colitis samples MCD = XL[:,CD].mean(1) ## Mean of Crohn's disease samples VUC = XL[:,UC].var(1) ## Variance of ulcerative colitis samples VCD = XL[:,CD].var(1) ## Variance of Crohn's disease samples nUC = len(UC) ## Number of ulcerative colitis samples nCD = len(CD) ## Number of Crohn's disease samples Z = (MUC - MCD) / np.sqrt(VUC/nUC + VCD/nCD) ## Split the sample in half based on marginal standard deviation of ## gene expression SD = X.std(1) isdl = np.flatnonzero(SD < np.median(SD)) isdh = np.flatnonzero(SD > np.median(SD)) ## Split the sample in half based on the marginal mean gene expression SD = X.mean(1) imnl = np.flatnonzero(SD < np.median(SD)) imnh = np.flatnonzero(SD > np.median(SD)) ## Gaussian distribution CDF, pdf, quantile function, etc. from scipy.stats.distributions import norm ## The Z-score threshold under a Bonferroni correction zst = -norm.ppf(0.025/Z.shape[0]) ## Write out some information about the genes that meet this condition ## to a file ii = np.flatnonzero(np.abs(Z) > zst) with open("bonferroni_genes.csv", "w") as OUT: for i in ii: OUT.write("%.4f,%s\n" % (Z[i], GID[i]))