Data

I’ve decided to create expression data of 6 genes. GeneA, GeneB, GeneC and genes that correspond to some Silencer, Enhancer and Inducer of some of their transcription. Genes themselves don’t carry any meaning - this data serves for practicing purposes.

# Data preparation
N <- 1000

GeneSilencer = rnorm(N, 6, 2) # Expression of a gene coding a silencing killer mRNA
GeneEnhancer = rnorm(N, 8, 3) # Expression of a gene coding a protein enhancing gene expression
GeneInducer = rnorm(N, 2, 1) # Expression of a gene coding a protein inducing gene expression of GeneC

GeneA = ifelse(GeneEnhancer > GeneSilencer, # Try mixture distribution
               rnorm(N, 5, 2),  # Normal distribution if GeneEnhancer is higher
               rexp(N, 1))      # Exponential distribution otherwise

GeneC = ifelse(GeneInducer > 3, # Expression of a gene dependent on an inducer
               rnorm(N, 2, 0.5),
               rexp(N, 1))

a=8
enh_coef=0.5
gena_coef=0.01
genc_coef=0.4
sil_coef=-0.3
enhsil_coef=-0.1
muB <- a + enh_coef*GeneEnhancer + sil_coef*GeneSilencer + gena_coef*GeneA + genc_coef*GeneC + enhsil_coef*GeneEnhancer*GeneSilencer
s=2
sigmaB <- s - 0.75*s * sigmoid(GeneC) # GeneC refines the value of expression - makes it less disperse
GeneB <- rnorm(N, muB, sigmaB) # Expression of a gene is positively influenced by the expression of GeneC and GeneA and non-linearly influenced by enhancer and silencer

expression_data <- list(
  GeneEnhancer = GeneEnhancer,
  GeneSilencer = GeneSilencer,
  GeneInducer = GeneInducer,
  GeneA = GeneA,
  GeneB = GeneB,
  GeneC = GeneC
)

summary(expression_data)
##              Length Class  Mode   
## GeneEnhancer 1000   -none- numeric
## GeneSilencer 1000   -none- numeric
## GeneInducer  1000   -none- numeric
## GeneA        1000   -none- numeric
## GeneB        1000   -none- numeric
## GeneC        1000   -none- numeric

Let’s see how some of the relationships turned out.

Firstly between Genes A, B and C.

library(ggplot2)
data <- data.frame(GeneA, GeneB, GeneC)

p1 <- ggplot(data, aes(x = GeneA, y = GeneB)) +
  geom_point(alpha = 0.5) +
  labs(title = "GeneA vs GeneB", x = "GeneA", y = "GeneB")

p2 <- ggplot(data, aes(x = GeneA, y = GeneC)) +
  geom_point(alpha = 0.5) +
  labs(title = "GeneA vs GeneC", x = "GeneA", y = "GeneC")

p3 <- ggplot(data, aes(x = GeneB, y = GeneC)) +
  geom_point(alpha = 0.5) +
  labs(title = "GeneB vs GeneC", x = "GeneB", y = "GeneC")

# Combine plots
library(gridExtra)
grid.arrange(p1, p2, p3, nrow = 1)

Let’s see how Silencer influences GeneA and GeneB

data <- data.frame(GeneA, GeneC, GeneSilencer)

p1 <- ggplot(data, aes(x = GeneSilencer, y = GeneA)) +
  geom_point(alpha = 0.5) +
  labs(title = "Influence of Silencer on GeneA", x = "GeneSilencer", y = "GeneA")

p2 <- ggplot(data, aes(x = GeneSilencer, y = GeneC)) +
  geom_point(alpha = 0.5) +
  labs(title = "Influence of Silencer on GeneC", x = "GeneSilencer", y = "GeneC")

grid.arrange(p1, p2, nrow = 1)

Model

Using informed priors, a scientist tries to decode what is actually happening for GeneA and GeneB with the information of expressions of GeneC, Enhancer, Inducer, Silencer. A wise thing to do beforehand would be to remove negative values from the data since these can be considered as errors of the measurement - the model here just replicates the behaviour of the measurement machine.

model <- alist(
  # likelihood
  GeneA ~ dnorm(muA, sigmaA),
  GeneB ~ dnorm(muB, sigmaB),
  muA <- a1 + b1E*GeneEnhancer + b12*GeneEnhancer*GeneSilencer             - b1S*GeneSilencer,
  muB <- a2 + b2E*GeneEnhancer + b22*GeneC + b23*GeneEnhancer*GeneSilencer - b2S*GeneSilencer,
  sigmaA <- s11 + s12 * GeneC,
  sigmaB <- s21 + s22 * GeneC,
  # priors
  c(a1, a2) ~ dnorm(0, 10),
  c(b12, b22, b23) ~ dnorm(0,2),
  c(b1E, b2E, b1S, b2S) ~ dexp(1), # knowledge of silencing and enhancing
  c(s12, s22) ~ dnorm(0,1), # allow heteroscedasticity dependent on GeneC
  c(s11, s21) ~ dexp(1)
)

Make predictions with the model using ulam and extract posterior.

## Running MCMC with 4 parallel chains, with 1 thread(s) per chain...
## 
## Chain 1 Iteration:   1 / 1000 [  0%]  (Warmup) 
## Chain 2 Iteration:   1 / 1000 [  0%]  (Warmup) 
## Chain 3 Iteration:   1 / 1000 [  0%]  (Warmup) 
## Chain 4 Iteration:   1 / 1000 [  0%]  (Warmup) 
## Chain 4 Iteration: 100 / 1000 [ 10%]  (Warmup) 
## Chain 2 Iteration: 100 / 1000 [ 10%]  (Warmup) 
## Chain 1 Iteration: 100 / 1000 [ 10%]  (Warmup) 
## Chain 3 Iteration: 100 / 1000 [ 10%]  (Warmup) 
## Chain 2 Iteration: 200 / 1000 [ 20%]  (Warmup) 
## Chain 1 Iteration: 200 / 1000 [ 20%]  (Warmup) 
## Chain 4 Iteration: 200 / 1000 [ 20%]  (Warmup) 
## Chain 3 Iteration: 200 / 1000 [ 20%]  (Warmup) 
## Chain 2 Iteration: 300 / 1000 [ 30%]  (Warmup) 
## Chain 1 Iteration: 300 / 1000 [ 30%]  (Warmup) 
## Chain 4 Iteration: 300 / 1000 [ 30%]  (Warmup) 
## Chain 3 Iteration: 300 / 1000 [ 30%]  (Warmup) 
## Chain 2 Iteration: 400 / 1000 [ 40%]  (Warmup) 
## Chain 1 Iteration: 400 / 1000 [ 40%]  (Warmup) 
## Chain 3 Iteration: 400 / 1000 [ 40%]  (Warmup) 
## Chain 4 Iteration: 400 / 1000 [ 40%]  (Warmup) 
## Chain 1 Iteration: 500 / 1000 [ 50%]  (Warmup) 
## Chain 1 Iteration: 501 / 1000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 500 / 1000 [ 50%]  (Warmup) 
## Chain 2 Iteration: 501 / 1000 [ 50%]  (Sampling) 
## Chain 3 Iteration: 500 / 1000 [ 50%]  (Warmup) 
## Chain 3 Iteration: 501 / 1000 [ 50%]  (Sampling) 
## Chain 4 Iteration: 500 / 1000 [ 50%]  (Warmup) 
## Chain 4 Iteration: 501 / 1000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 600 / 1000 [ 60%]  (Sampling) 
## Chain 1 Iteration: 700 / 1000 [ 70%]  (Sampling) 
## Chain 3 Iteration: 600 / 1000 [ 60%]  (Sampling) 
## Chain 2 Iteration: 600 / 1000 [ 60%]  (Sampling) 
## Chain 4 Iteration: 600 / 1000 [ 60%]  (Sampling) 
## Chain 1 Iteration: 800 / 1000 [ 80%]  (Sampling) 
## Chain 3 Iteration: 700 / 1000 [ 70%]  (Sampling) 
## Chain 1 Iteration: 900 / 1000 [ 90%]  (Sampling) 
## Chain 3 Iteration: 800 / 1000 [ 80%]  (Sampling) 
## Chain 4 Iteration: 700 / 1000 [ 70%]  (Sampling) 
## Chain 2 Iteration: 700 / 1000 [ 70%]  (Sampling) 
## Chain 1 Iteration: 1000 / 1000 [100%]  (Sampling) 
## Chain 1 finished in 328.3 seconds.
## Chain 3 Iteration: 900 / 1000 [ 90%]  (Sampling) 
## Chain 3 Iteration: 1000 / 1000 [100%]  (Sampling) 
## Chain 3 finished in 349.6 seconds.
## Chain 4 Iteration: 800 / 1000 [ 80%]  (Sampling) 
## Chain 2 Iteration: 800 / 1000 [ 80%]  (Sampling) 
## Chain 4 Iteration: 900 / 1000 [ 90%]  (Sampling) 
## Chain 2 Iteration: 900 / 1000 [ 90%]  (Sampling) 
## Chain 4 Iteration: 1000 / 1000 [100%]  (Sampling) 
## Chain 4 finished in 403.6 seconds.
## Chain 2 Iteration: 1000 / 1000 [100%]  (Sampling) 
## Chain 2 finished in 429.8 seconds.
## 
## All 4 chains finished successfully.
## Mean chain execution time: 377.8 seconds.
## Total execution time: 430.1 seconds.

View posterior values and verify if they make sense.

precis(posterior)
##            mean          sd        5.5%       94.5%     rhat ess_bulk
## a2   8.00496114 0.223352909  7.63567775  8.34737520 1.010539 419.4611
## a1   5.29055916 0.444710672  4.52325570  5.95354245 1.021468 313.0663
## b23 -0.10093989 0.004169533 -0.10746327 -0.09448202 1.008200 462.5344
## b22  0.40004541 0.011155996  0.38373618  0.41718255 1.004962 694.2913
## b12  0.05622006 0.008415704  0.04215765  0.06840110 1.016757 302.1107
## b2S  0.30420561 0.035004418  0.24789267  0.35791072 1.009633 436.0372
## b1S  0.79185618 0.073441088  0.67181489  0.90444799 1.018474 329.5657
## b2E  0.50910616 0.025738379  0.47001178  0.55152620 1.009032 450.8944
## b1E  0.07911460 0.050147333  0.01036833  0.16660174 1.018371 283.2081
## s22 -0.14666910 0.007472548 -0.15640127 -0.13281771 1.002959 419.5657
## s12  0.01805547 0.043950909 -0.05334251  0.08750943 1.005506 754.9652
## s21  1.05706404 0.026820226  1.01513295  1.10004825 1.002702 571.3311
## s11  2.02468946 0.066148000  1.92095340  2.12860535 1.006806 706.6779

Visualize the algorithm’s traceplots - of chains trying to estimate values of the model.

traceplot(posterior)

Dataset

Create and save a dataset using generated data and sampled data from the model.

I’ve decided to extract samples from the model for GeneA and GeneB and create a new GeneD which is a non-linear combination of GeneA and GeneB sampled from the model.

preds=link(posterior, expression_data)
mean_GeneA=apply(preds$muA, 2, mean)
mean_GeneB=apply(preds$muB, 2, mean)

a=0
b=0.25
c=0.4
d=0.1
muD <- a + b*mean_GeneA + c*mean_GeneB + d*mean_GeneA*mean_GeneB
GeneD = rnorm(N, muD, 1)

df_gene_network <- data.frame(GeneA=expression_data$GeneA,
                              GeneB=expression_data$GeneB,
                              GeneC=expression_data$GeneC,
                              GeneD=GeneD,
                              GeneEnhancer=expression_data$GeneEnhancer,
                              GeneSilencer=expression_data$GeneSilencer,
                              GeneInducer=expression_data$GeneInducer)

#write.csv(df_gene_network, "gene_network_expression.csv")