import numpy as np
from read_gene_expression import X,GID,STP,SID,UC,CD
## Convert X to log scale
XL = np.log(X) / np.log(2)
## Z-test statistics
MUC = XL[:,UC].mean(1) ## Mean of ulcerative colitis samples
MCD = XL[:,CD].mean(1) ## Mean of Crohn's disease samples
VUC = XL[:,UC].var(1) ## Variance of ulcerative colitis samples
VCD = XL[:,CD].var(1) ## Variance of Crohn's disease samples
nUC = len(UC) ## Number of ulcerative colitis samples
nCD = len(CD) ## Number of Crohn's disease samples
Z = (MUC - MCD) / np.sqrt(VUC/nUC + VCD/nCD)
## Split the sample in half based on marginal standard deviation of
## gene expression
SD = X.std(1)
isdl = np.flatnonzero(SD < np.median(SD))
isdh = np.flatnonzero(SD > np.median(SD))
## Split the sample in half based on the marginal mean gene expression
SD = X.mean(1)
imnl = np.flatnonzero(SD < np.median(SD))
imnh = np.flatnonzero(SD > np.median(SD))
## Gaussian distribution CDF, pdf, quantile function, etc.
from scipy.stats.distributions import norm
## The Z-score threshold under a Bonferroni correction
zst = -norm.ppf(0.025/Z.shape[0])
## Write out some information about the genes that meet this condition
## to a file
ii = np.flatnonzero(np.abs(Z) > zst)
with open("bonferroni_genes.csv", "w") as OUT:
for i in ii:
OUT.write("%.4f,%s\n" % (Z[i], GID[i]))