# This source code including all necessary steps for the paper:
# Deng et al., "Spatial scaling of forest soil microbial communities along broad temperature gradient"
# Please read the paper first, and then use this code for your calculation
# Please cite our paper if you find this code useful. 


library("SPECIES")
library("gtools")
library("vegan")

rm(list=ls(all=TRUE))

setwd("Default directory")

### read original OTU table rather than resampled OTU table ####
OTU_dat=read.table("16S_97sim.txt",sep="\t",header=T,row.names=1)
OTU.dat = t(OTU_dat)
OTU.dat[is.na(OTU.dat)] = 0

# for group #
# Our dataset included 6 sampling sites, and each one had 21 nested designed samples. 
# The following code is to group the samples in each site together. It should be revised according to your sampling design and sample names. 
samp = rownames(OTU.dat)
grp1 = mixedsort(samp[grep("B",samp)]); grp1 = c(grp1[length(grp1)], grp1[1:(length(grp1)-1)])
grp2 = mixedsort(samp[grep("L",samp)]); grp2 = c(grp2[length(grp2)], grp2[1:(length(grp2)-1)])
grp3 = mixedsort(samp[grep("^C",samp)]); grp3 = c(grp3[length(grp3)], grp3[1:(length(grp3)-1)])
grp4 = mixedsort(samp[grep("A",samp)]); grp4 = c(grp4[length(grp4)], grp4[1:(length(grp4)-1)])
grp5 = mixedsort(samp[grep("H",samp)]); grp5 = c(grp5[length(grp5)], grp5[1:(length(grp5)-1)])
grp6 = mixedsort(samp[grep("^N",samp)]); grp6 = c(grp6[length(grp6)], grp6[1:(length(grp6)-1)])
grp = list(grp1, grp2, grp3, grp4, grp5, grp6)
names(grp) = order1 = c("BCI", "LUQ", "CWT", "AND", "HFR",  "NWT")
grplist    = c(rep("BCI",21), rep("LUQ",21), rep("CWT",21), rep("AND",21), rep("HFR",21),  rep("NWT",21))
grp.order  = c(grp1, grp2, grp3, grp4, grp5, grp6)

# Distance
distance = c(2, 20, 100, 200, 400)
area = distance * distance/2

#=====================================
#--Chao1 based theoretical OTU number----
#=====================================

z.chao1 = logc.chao1  = p.chao1 = c()
chao1.samp = c()
richness.chao1  = c()
for(i in 1:6){
  commu = OTU.dat[grp[[i]],]
  chao1.rich = ace.rich = c()
  est.rich = estimateR(commu)
  chao1.samp = c(chao1.samp, est.rich[2,])
  for (j in 1:5){
    sub_commu = commu[1:(j*4+1),]
    pooled.commu = colSums(sub_commu)
    est.rich = estimateR(pooled.commu)
    chao1.rich = c(chao1.rich, est.rich[2])
  }
  richness.chao1 = cbind(richness.chao1, chao1.rich)
  
  lm1 = lm(log10(chao1.rich)~log10(area))
  z.chao1 = c(z.chao1, as.numeric(lm1$coefficients[2]))
  logc.chao1 = c(logc.chao1, as.numeric(lm1$coefficients[1]))
  
  z.chao1.null =  c()
  for (aa in 1:1000){
    chao1.null = sample(chao1.rich, replace=T)
    lm1 = lm(log10(chao1.null)~log10(area))
    z.chao1.null = c(z.chao1.null, as.numeric(lm1$coefficients[2]))
  }
  ttest1 = t.test(z.chao1.null, mu = z.chao1[i])
  p.chao1 = c(p.chao1, ttest1$p.value)
}
p.chao1

#--plot for Fig. 1B -----
library("ggplot2")

#---linear on Tax-area relationship----
richness = richness.chao1 #ace #
dat.mean.lg = log10(richness)
area.lg = log10(area)
colnames(dat.mean.lg) = names(grp)

dat.4plot = c()
for(i in 1:ncol(dat.mean.lg)){
  dat = cbind(area.lg, dat.mean.lg[,i])
  dat.4plot = rbind(dat.4plot, dat)
}
dat.4plot = as.data.frame(dat.4plot)
colnames(dat.4plot) = c("area.lg", "lg.rich")
site.list = rep(names(grp), each = 5)

g1 = ggplot(dat.4plot, aes(x=area.lg, y=lg.rich, color=site.list, size=3))+
  geom_point()+
  geom_smooth(method="lm", size=1,fill=NA,fullrange=TRUE)+ 
  xlab("log(area)") +
  ylab("log(species richness)")+
  scale_x_continuous(expand=c(0,0), limits=c(0,5.2)) +
  theme_bw()
print(g1+theme(axis.title=element_text(size=24),axis.text=element_text(size=20),legend.text=element_text(size=20),legend.key=element_rect(fill=NA),panel.grid.major = element_blank(), panel.grid.minor = element_blank()))


#================================================================
#---Observed empicical OTUs (Through 100 times of resampling)-------
#================================================================
richness.obs = list()
z.obs = c()
R2.obs = c()
n.obs  = c()
logc.obs = c()
p.obs = c()
H.obs = c()
D.obs = c()
J.obs = c()
OTU.num.obs = c()

### Random sampling 100 times ###
### Here needs quite long time, so we prefer to save the results as a R image and load it for following calculation###

OTU.list = colnames(OTU.dat)
reads.num = rowSums(OTU.dat)
resample.size = min(reads.num)
for(a in 1:100){
  OTU.dat.resample = OTU.dat
  for(i in 1:nrow(OTU.dat)){ #resample each sample
    OTU.dat.vector = rep(OTU.list, OTU.dat[i,])
    resample = sample(OTU.dat.vector, size = resample.size)
    freq = table(resample)
    OTU.dat.resample[i,] = 0
    OTU.dat.resample[i, names(freq)] = freq
  }
  
  #--remove empty columns---
  colsums = colSums(OTU.dat.resample)
  OTU.dat.resample = OTU.dat.resample[,which(colsums>0)]
  OTU.num = rowSums(OTU.dat.resample>0)
  n.obs = rbind(n.obs, tapply(OTU.num, grplist, mean))
  
  #--calculate Diversity---
  H = diversity(OTU.dat.resample)
  D = diversity(OTU.dat.resample, "simpson")
  J = H/log(OTU.num) # enveness
  H.obs = rbind(H.obs, H)
  D.obs = rbind(D.obs, D)
  J.obs = rbind(J.obs, J)
  OTU.num.obs = rbind(OTU.num.obs, OTU.num)
  
  #--calculate z values---
  richness.obs.site = c()
  z.obs.site = logc.obs.site = c()
  R2.obs.site  = c()
  for(i in 1:length(grp)){ # for each site
    commu = OTU.dat.resample[grp[[i]],]
    rich.obs = c()
    for (j in 1:5){
      sub_commu = commu[1:(j*4+1),]
      # count species richness
      col.count = colSums(sub_commu >0 )
      sp.rich.obs = length(which(col.count>0))
      rich.obs = c(rich.obs, sp.rich.obs)
      
    }
    lm1 = lm(log10(rich.obs)~log10(area))
    z.obs.site = c(z.obs.site, as.numeric(lm1$coefficients[2]))
    logc.obs.site = c(logc.obs.site, as.numeric(lm1$coefficients[1]))
    
    R2.obs.site = c(R2.obs.site, (cor(log10(rich.obs),log10(area)))^2)
    
    richness.obs.site = cbind(richness.obs.site, rich.obs)
    
  }
  colnames(richness.obs.site) = names(grp)
  richness.obs[[a]] = richness.obs.site
  
  z.obs = rbind(z.obs, z.obs.site)
  logc.obs = rbind(logc.obs, logc.obs.site)
  R2.obs = rbind(R2.obs, R2.obs.site)
}

colnames(z.obs) = colnames(logc.obs) = names(grp)
#richness.obs
cbind(colMeans(n.obs[,order1]), 10^colMeans(logc.obs), colMeans(z.obs))
cbind(colMeans(OTU.num.obs), colMeans(H.obs), colMeans(J.obs), colMeans(D.obs))
cbind(apply(OTU.num.obs, 2, sd), apply(OTU.num.obs, 2, sd), apply(H.obs, 2, sd), apply(J.obs, 2, sd), apply(D.obs, 2, sd))

save.image("16S_97sim.RData")

#-----------
# for plot Fig. 1A
#-----------
richness = richness.obs  #richness.ace1 #
z = colMeans(z.obs) #z.chao1)  #z.ace1)  #
z.err = apply(z.obs, 2, sd) #z.chao1  #z.ace1

library("ggplot2")

#---linear on Tax-area relationship----
#---modified for permutation
richness = richness.obs #richness.chao1 #
dat.mix = array(0, dim=c(5,6,100))
for(a in 1:100){
  dat.mix[,,a] = richness[[a]]
}
dat.mean = dat.max = dat.min = matrix(0, ncol=6, nrow=5)
for(x in 1:5){
  for(y in 1:6){
    dat.mean[x, y] = mean(dat.mix[x,y,])
    dat.max[x, y] = max(dat.mix[x,y,])
    dat.min[x, y] = min(dat.mix[x,y,])
  }
}
dat.mean.lg = log10(dat.mean)
dat.max.lg = log10(dat.max)
dat.min.lg = log10(dat.min)
area.lg = log10(area)
colnames(dat.mean.lg) = names(grp)

dat.4plot = c()
for(i in 1:ncol(dat.mean.lg)){
  dat = cbind(area.lg, dat.mean.lg[,i], dat.min.lg[,i], dat.max.lg[,i])
  dat.4plot = rbind(dat.4plot, dat)
}
dat.4plot = as.data.frame(dat.4plot)
colnames(dat.4plot) = c("area.lg", "lg.rich", "lg.rich.min", "lg.rich.max")
site.list = rep(names(grp), each = 5)

g1 = ggplot(dat.4plot, aes(x=area.lg, y=lg.rich, color=site.list))+
  geom_point()+
  geom_errorbar(aes(x=area.lg,ymin=lg.rich.min, ymax=lg.rich.max,color=site.list, width=0.2))+
  stat_smooth(method="lm",fullrange=TRUE,fill=NA,size=1)+ #
  xlab("log(area)") +
  ylab("log(species richness)") +
  scale_x_continuous(expand=c(0,0), limits=c(0,5.2)) +
  theme_bw()
print(g1+theme(axis.title=element_text(size=24),axis.text=element_text(size=20),legend.text=element_text(size=20),legend.key=element_rect(fill=NA),panel.grid.major = element_blank(), panel.grid.minor = element_blank()))

z.perm.site = c()
intercept.site = c()
for(i in 1:ncol(dat.mean.lg)){
  lm1 = lm(dat.mean.lg[,i]~area.lg)
  z.perm.site = c(z.perm.site, lm1$coefficients[2])
  intercept.site = c(intercept.site, lm1$coefficients[1])
}
names(intercept.site) = colnames(dat.mean.lg)
intercept.site
cor.test(intercept.site, temp )
c1 = mean(intercept.site)


#================================
#---Fit different z model-------
#================================
library(mmSAR)
mods = c("power","expo","negexpo","monod","logist","ratio","lomolino","weibull")
data(power);data(expo);data(negexpo);data(monod);data(ratio);data(logist);data(lomolino);data(weibull)
z = colMeans(z.obs)
for(i in 1:ncol(dat.mean)){
  print(names(grp)[i])
  lm1 = lm(log10(dat.mean[,i]) ~ log10(area)) #AIC of log-log
  aic = AIC(lm1)
  print(aic)
  dat4model = list(a = area, s = dat.mean[,i])
  dat4SAR = list(data = data.frame(dat4model), name=names(grp)[i])
  res = multiSAR(modelList=mods, dat4SAR)
  print(res$optimRes)
}


#======================================
#---Plot z values with temperature-----
#======================================

temp = c(25, 23, 16, 8, 6, 2) # Input the temperatures here. The current data are not accurate temperatures in our sites (just for example)  
  
site.temp = temp
names(site.temp) = c("BCI", "Luquillo", "Coweeta", "HJA", "Harvard",  "Niwot")

z = colMeans(z.obs)
z.err = apply(z.obs, 2, sd) 

r = cor(z,site.temp)
myline.fit = lm(z ~ site.temp)
slope = myline.fit$coefficients[2]

dat4plot = cbind(site.temp, z, z.err)

##----- plot for linear and non-linear trendline (Fig. 3)
ggplot(data=as.data.frame(dat4plot), aes(x=site.temp, y=z))+
  geom_point(aes(color=factor(1:6), size =3))+
  geom_errorbar(aes(x=site.soil.temp,ymin=z-z.err, ymax=z+z.err,color=factor(1:6)))+
  geom_smooth(method="lm", se=FALSE, color="blue", formula=y~x, size=1.1)+
  geom_smooth(method="lm", se=FALSE, color="red", formula=y~log(x), size=1.1)+
  #geom_smooth(method="nls", se=FALSE, color="black", formula='y~a*x^b', start=list(a=0.05, b=0.05), size=1.1)+
  ggtitle(bquote(R^2~"="~.(round(r*r,digits=3))~", slope="~.(signif(slope,digits=3))))+
  xlab("Anual average temperature") +
  ylab("z value")


#==================================
#-------Correlation test-----------
#==================================

env.site = read.table("Site_env.txt", sep="\t", header=T, row.names=1)
env.variable = env.site[,c(1:15)] # Pick up some important env variables
env.variable = scale(env.variable) # Standardize the environmental variables

rep2 = c()
for(i in 1:ncol(env.variable)){
  cor = cor.test(z, env.variable[,i])
  rep2 = rbind(rep2, c(cor$estimate, cor$p.value))
}
rownames(rep2) = names(env.variable)
rep2


#================================
#------Plant z value plots-------
#================================

plant.z = c(0.3007,  0.311,	0.287,	0.222,	0.166,	0.09) # plant z values were calculated by Enquist's lab
cor.test(plant.z, site.temp)
r = cor(plant.z,site.temp)
myline.fit = lm(plant.z ~ site.temp)
slope = myline.fit$coefficients[2]


dat4plot = cbind(plant.z, z, z.err)
r = cor(plant.z,z)
cor.test(plant.z, z)
ggplot(data=as.data.frame(dat4plot), aes(x=plant.z, y=z))+
  geom_point(aes(color=factor(1:6), size =3))+
  geom_errorbar(aes(x=plant.z,ymin=z-z.err, ymax=z+z.err,color=factor(1:6)))+
  geom_smooth(method="lm", se=FALSE, color="blue", formula=y~x)+
  ggtitle(bquote(R^2~"="~.(round(r*r,digits=3))))+
  xlab("Plant z value") +
  ylab("Microbial z value")



#========================================
#------Multiple regression tree-----
#========================================

library("mvpart")

OTU.dat = read.table("OTU dataset after resampling.txt",sep="\t",header=T,row.names=1)

env.sample = read.table("env variables by samples.txt",sep="\t",header=T,row.names=1)
 
OTU.dat = t(OTU.dat)
dim(OTU.dat)
colnames(env.sample)

mrt_re3 = mvpart(data.matrix(geochip)~TN+TC+PH+MAT_Sean+Plant_rich,group, xv="1se", xval=50, xvmult=10)
summary(mrt_re3)


#=====================================
#---- Modeling the species richness---
#=====================================

library("ggplot2")

setwd("E:/backup-8.21.15/my work in OU/Lina_Metabolic theory/analysis/modeling")
rm(list=ls(all=TRUE))

temp.max = 30
temp.min = 2

# All below formula were obtained through above Fig. 3
z.max1 = 0.0688 + 0.000584*temp.max 
z.min1 = 0.0688 + 0.000584*temp.min

z.max2 = 0.0619 * temp.max^0.0894
z.min2 = 0.0619 * temp.min^0.0894

z.max3 = 0.0067*log(temp.max) + 0.0609
z.min3 = 0.0067*log(temp.min) + 0.0609

z.max = max(z.max1, z.max2, z.max3)
z.min = min(z.min1, z.min2, z.min3)


c1 = 10^4.352  # obtained from the average intercept of 6 sites

samp1 = seq(0,12)
area  = c(10^samp1, 3.28*10^12)

sp.rich.max = c1*(area^z.max)
sp.rich.min = c1*(area^z.min)

sp.rich.max1 = c1*(area^z.max1)
sp.rich.min1 = c1*(area^z.min1)
sp.rich.max2 = c1*(area^z.max2)
sp.rich.min2 = c1*(area^z.min2)
sp.rich.max3 = c1*(area^z.max3)
sp.rich.min3 = c1*(area^z.min3)


dat4plot = cbind(area, sp.rich.max, sp.rich.min, sp.rich.max1, sp.rich.min1, sp.rich.max2, sp.rich.min2, sp.rich.max3, sp.rich.min3)
number_ticks <- function(n) {function(limits) pretty(limits, n)}
ggplot(as.data.frame(dat4plot), aes(x=area, y=sp.rich.max1/1000)) + 
  geom_line(colour="blue") + 
  geom_line(aes(x=area, y=sp.rich.min1/1000), color="blue") + 
  geom_line(aes(x=area, y=sp.rich.max2/1000), color="yellow") + 
  geom_line(aes(x=area, y=sp.rich.min2/1000), color="yellow") + 
  geom_line(aes(x=area, y=sp.rich.max3/1000), color="red") + 
  geom_line(aes(x=area, y=sp.rich.min3/1000), color="red") + 
  
  geom_vline(xintercept = max(area), linetype="dotted") + 
  geom_ribbon(aes(x=area, ymin=sp.rich.min1/1000, ymax = sp.rich.max1/1000), fill="blue", alpha=0.2) + 
  geom_ribbon(aes(x=area, ymin=sp.rich.min2/1000, ymax = sp.rich.max2/1000), fill="yellow", alpha=0.2) + 
  geom_ribbon(aes(x=area, ymin=sp.rich.min3/1000, ymax = sp.rich.max3/1000), fill="red", alpha=0.2) + 
  scale_x_log10(breaks=area[-length(area)])+
  scale_y_log10(breaks=c(1, 50,100,150, 200, 250, 300, 350))+  #breaks=number_ticks(5)
  xlab("Area (m^2)")+
  ylab("OTU richness (x1000)")+
  theme(axis.text.x = element_text(angle=90), axis.text=element_text(size=16), axis.title=element_text(size=20, face="bold"))