# This source code including all necessary steps for the paper: # Deng et al., "Spatial scaling of forest soil microbial communities along broad temperature gradient" # Please read the paper first, and then use this code for your calculation # Please cite our paper if you find this code useful. library("SPECIES") library("gtools") library("vegan") rm(list=ls(all=TRUE)) setwd("Default directory") ### read original OTU table rather than resampled OTU table #### OTU_dat=read.table("16S_97sim.txt",sep="\t",header=T,row.names=1) OTU.dat = t(OTU_dat) OTU.dat[is.na(OTU.dat)] = 0 # for group # # Our dataset included 6 sampling sites, and each one had 21 nested designed samples. # The following code is to group the samples in each site together. It should be revised according to your sampling design and sample names. samp = rownames(OTU.dat) grp1 = mixedsort(samp[grep("B",samp)]); grp1 = c(grp1[length(grp1)], grp1[1:(length(grp1)-1)]) grp2 = mixedsort(samp[grep("L",samp)]); grp2 = c(grp2[length(grp2)], grp2[1:(length(grp2)-1)]) grp3 = mixedsort(samp[grep("^C",samp)]); grp3 = c(grp3[length(grp3)], grp3[1:(length(grp3)-1)]) grp4 = mixedsort(samp[grep("A",samp)]); grp4 = c(grp4[length(grp4)], grp4[1:(length(grp4)-1)]) grp5 = mixedsort(samp[grep("H",samp)]); grp5 = c(grp5[length(grp5)], grp5[1:(length(grp5)-1)]) grp6 = mixedsort(samp[grep("^N",samp)]); grp6 = c(grp6[length(grp6)], grp6[1:(length(grp6)-1)]) grp = list(grp1, grp2, grp3, grp4, grp5, grp6) names(grp) = order1 = c("BCI", "LUQ", "CWT", "AND", "HFR", "NWT") grplist = c(rep("BCI",21), rep("LUQ",21), rep("CWT",21), rep("AND",21), rep("HFR",21), rep("NWT",21)) grp.order = c(grp1, grp2, grp3, grp4, grp5, grp6) # Distance distance = c(2, 20, 100, 200, 400) area = distance * distance/2 #===================================== #--Chao1 based theoretical OTU number---- #===================================== z.chao1 = logc.chao1 = p.chao1 = c() chao1.samp = c() richness.chao1 = c() for(i in 1:6){ commu = OTU.dat[grp[[i]],] chao1.rich = ace.rich = c() est.rich = estimateR(commu) chao1.samp = c(chao1.samp, est.rich[2,]) for (j in 1:5){ sub_commu = commu[1:(j*4+1),] pooled.commu = colSums(sub_commu) est.rich = estimateR(pooled.commu) chao1.rich = c(chao1.rich, est.rich[2]) } richness.chao1 = cbind(richness.chao1, chao1.rich) lm1 = lm(log10(chao1.rich)~log10(area)) z.chao1 = c(z.chao1, as.numeric(lm1$coefficients[2])) logc.chao1 = c(logc.chao1, as.numeric(lm1$coefficients[1])) z.chao1.null = c() for (aa in 1:1000){ chao1.null = sample(chao1.rich, replace=T) lm1 = lm(log10(chao1.null)~log10(area)) z.chao1.null = c(z.chao1.null, as.numeric(lm1$coefficients[2])) } ttest1 = t.test(z.chao1.null, mu = z.chao1[i]) p.chao1 = c(p.chao1, ttest1$p.value) } p.chao1 #--plot for Fig. 1B ----- library("ggplot2") #---linear on Tax-area relationship---- richness = richness.chao1 #ace # dat.mean.lg = log10(richness) area.lg = log10(area) colnames(dat.mean.lg) = names(grp) dat.4plot = c() for(i in 1:ncol(dat.mean.lg)){ dat = cbind(area.lg, dat.mean.lg[,i]) dat.4plot = rbind(dat.4plot, dat) } dat.4plot = as.data.frame(dat.4plot) colnames(dat.4plot) = c("area.lg", "lg.rich") site.list = rep(names(grp), each = 5) g1 = ggplot(dat.4plot, aes(x=area.lg, y=lg.rich, color=site.list, size=3))+ geom_point()+ geom_smooth(method="lm", size=1,fill=NA,fullrange=TRUE)+ xlab("log(area)") + ylab("log(species richness)")+ scale_x_continuous(expand=c(0,0), limits=c(0,5.2)) + theme_bw() print(g1+theme(axis.title=element_text(size=24),axis.text=element_text(size=20),legend.text=element_text(size=20),legend.key=element_rect(fill=NA),panel.grid.major = element_blank(), panel.grid.minor = element_blank())) #================================================================ #---Observed empicical OTUs (Through 100 times of resampling)------- #================================================================ richness.obs = list() z.obs = c() R2.obs = c() n.obs = c() logc.obs = c() p.obs = c() H.obs = c() D.obs = c() J.obs = c() OTU.num.obs = c() ### Random sampling 100 times ### ### Here needs quite long time, so we prefer to save the results as a R image and load it for following calculation### OTU.list = colnames(OTU.dat) reads.num = rowSums(OTU.dat) resample.size = min(reads.num) for(a in 1:100){ OTU.dat.resample = OTU.dat for(i in 1:nrow(OTU.dat)){ #resample each sample OTU.dat.vector = rep(OTU.list, OTU.dat[i,]) resample = sample(OTU.dat.vector, size = resample.size) freq = table(resample) OTU.dat.resample[i,] = 0 OTU.dat.resample[i, names(freq)] = freq } #--remove empty columns--- colsums = colSums(OTU.dat.resample) OTU.dat.resample = OTU.dat.resample[,which(colsums>0)] OTU.num = rowSums(OTU.dat.resample>0) n.obs = rbind(n.obs, tapply(OTU.num, grplist, mean)) #--calculate Diversity--- H = diversity(OTU.dat.resample) D = diversity(OTU.dat.resample, "simpson") J = H/log(OTU.num) # enveness H.obs = rbind(H.obs, H) D.obs = rbind(D.obs, D) J.obs = rbind(J.obs, J) OTU.num.obs = rbind(OTU.num.obs, OTU.num) #--calculate z values--- richness.obs.site = c() z.obs.site = logc.obs.site = c() R2.obs.site = c() for(i in 1:length(grp)){ # for each site commu = OTU.dat.resample[grp[[i]],] rich.obs = c() for (j in 1:5){ sub_commu = commu[1:(j*4+1),] # count species richness col.count = colSums(sub_commu >0 ) sp.rich.obs = length(which(col.count>0)) rich.obs = c(rich.obs, sp.rich.obs) } lm1 = lm(log10(rich.obs)~log10(area)) z.obs.site = c(z.obs.site, as.numeric(lm1$coefficients[2])) logc.obs.site = c(logc.obs.site, as.numeric(lm1$coefficients[1])) R2.obs.site = c(R2.obs.site, (cor(log10(rich.obs),log10(area)))^2) richness.obs.site = cbind(richness.obs.site, rich.obs) } colnames(richness.obs.site) = names(grp) richness.obs[[a]] = richness.obs.site z.obs = rbind(z.obs, z.obs.site) logc.obs = rbind(logc.obs, logc.obs.site) R2.obs = rbind(R2.obs, R2.obs.site) } colnames(z.obs) = colnames(logc.obs) = names(grp) #richness.obs cbind(colMeans(n.obs[,order1]), 10^colMeans(logc.obs), colMeans(z.obs)) cbind(colMeans(OTU.num.obs), colMeans(H.obs), colMeans(J.obs), colMeans(D.obs)) cbind(apply(OTU.num.obs, 2, sd), apply(OTU.num.obs, 2, sd), apply(H.obs, 2, sd), apply(J.obs, 2, sd), apply(D.obs, 2, sd)) save.image("16S_97sim.RData") #----------- # for plot Fig. 1A #----------- richness = richness.obs #richness.ace1 # z = colMeans(z.obs) #z.chao1) #z.ace1) # z.err = apply(z.obs, 2, sd) #z.chao1 #z.ace1 library("ggplot2") #---linear on Tax-area relationship---- #---modified for permutation richness = richness.obs #richness.chao1 # dat.mix = array(0, dim=c(5,6,100)) for(a in 1:100){ dat.mix[,,a] = richness[[a]] } dat.mean = dat.max = dat.min = matrix(0, ncol=6, nrow=5) for(x in 1:5){ for(y in 1:6){ dat.mean[x, y] = mean(dat.mix[x,y,]) dat.max[x, y] = max(dat.mix[x,y,]) dat.min[x, y] = min(dat.mix[x,y,]) } } dat.mean.lg = log10(dat.mean) dat.max.lg = log10(dat.max) dat.min.lg = log10(dat.min) area.lg = log10(area) colnames(dat.mean.lg) = names(grp) dat.4plot = c() for(i in 1:ncol(dat.mean.lg)){ dat = cbind(area.lg, dat.mean.lg[,i], dat.min.lg[,i], dat.max.lg[,i]) dat.4plot = rbind(dat.4plot, dat) } dat.4plot = as.data.frame(dat.4plot) colnames(dat.4plot) = c("area.lg", "lg.rich", "lg.rich.min", "lg.rich.max") site.list = rep(names(grp), each = 5) g1 = ggplot(dat.4plot, aes(x=area.lg, y=lg.rich, color=site.list))+ geom_point()+ geom_errorbar(aes(x=area.lg,ymin=lg.rich.min, ymax=lg.rich.max,color=site.list, width=0.2))+ stat_smooth(method="lm",fullrange=TRUE,fill=NA,size=1)+ # xlab("log(area)") + ylab("log(species richness)") + scale_x_continuous(expand=c(0,0), limits=c(0,5.2)) + theme_bw() print(g1+theme(axis.title=element_text(size=24),axis.text=element_text(size=20),legend.text=element_text(size=20),legend.key=element_rect(fill=NA),panel.grid.major = element_blank(), panel.grid.minor = element_blank())) z.perm.site = c() intercept.site = c() for(i in 1:ncol(dat.mean.lg)){ lm1 = lm(dat.mean.lg[,i]~area.lg) z.perm.site = c(z.perm.site, lm1$coefficients[2]) intercept.site = c(intercept.site, lm1$coefficients[1]) } names(intercept.site) = colnames(dat.mean.lg) intercept.site cor.test(intercept.site, temp ) c1 = mean(intercept.site) #================================ #---Fit different z model------- #================================ library(mmSAR) mods = c("power","expo","negexpo","monod","logist","ratio","lomolino","weibull") data(power);data(expo);data(negexpo);data(monod);data(ratio);data(logist);data(lomolino);data(weibull) z = colMeans(z.obs) for(i in 1:ncol(dat.mean)){ print(names(grp)[i]) lm1 = lm(log10(dat.mean[,i]) ~ log10(area)) #AIC of log-log aic = AIC(lm1) print(aic) dat4model = list(a = area, s = dat.mean[,i]) dat4SAR = list(data = data.frame(dat4model), name=names(grp)[i]) res = multiSAR(modelList=mods, dat4SAR) print(res$optimRes) } #====================================== #---Plot z values with temperature----- #====================================== temp = c(25, 23, 16, 8, 6, 2) # Input the temperatures here. The current data are not accurate temperatures in our sites (just for example) site.temp = temp names(site.temp) = c("BCI", "Luquillo", "Coweeta", "HJA", "Harvard", "Niwot") z = colMeans(z.obs) z.err = apply(z.obs, 2, sd) r = cor(z,site.temp) myline.fit = lm(z ~ site.temp) slope = myline.fit$coefficients[2] dat4plot = cbind(site.temp, z, z.err) ##----- plot for linear and non-linear trendline (Fig. 3) ggplot(data=as.data.frame(dat4plot), aes(x=site.temp, y=z))+ geom_point(aes(color=factor(1:6), size =3))+ geom_errorbar(aes(x=site.soil.temp,ymin=z-z.err, ymax=z+z.err,color=factor(1:6)))+ geom_smooth(method="lm", se=FALSE, color="blue", formula=y~x, size=1.1)+ geom_smooth(method="lm", se=FALSE, color="red", formula=y~log(x), size=1.1)+ #geom_smooth(method="nls", se=FALSE, color="black", formula='y~a*x^b', start=list(a=0.05, b=0.05), size=1.1)+ ggtitle(bquote(R^2~"="~.(round(r*r,digits=3))~", slope="~.(signif(slope,digits=3))))+ xlab("Anual average temperature") + ylab("z value") #================================== #-------Correlation test----------- #================================== env.site = read.table("Site_env.txt", sep="\t", header=T, row.names=1) env.variable = env.site[,c(1:15)] # Pick up some important env variables env.variable = scale(env.variable) # Standardize the environmental variables rep2 = c() for(i in 1:ncol(env.variable)){ cor = cor.test(z, env.variable[,i]) rep2 = rbind(rep2, c(cor$estimate, cor$p.value)) } rownames(rep2) = names(env.variable) rep2 #================================ #------Plant z value plots------- #================================ plant.z = c(0.3007, 0.311, 0.287, 0.222, 0.166, 0.09) # plant z values were calculated by Enquist's lab cor.test(plant.z, site.temp) r = cor(plant.z,site.temp) myline.fit = lm(plant.z ~ site.temp) slope = myline.fit$coefficients[2] dat4plot = cbind(plant.z, z, z.err) r = cor(plant.z,z) cor.test(plant.z, z) ggplot(data=as.data.frame(dat4plot), aes(x=plant.z, y=z))+ geom_point(aes(color=factor(1:6), size =3))+ geom_errorbar(aes(x=plant.z,ymin=z-z.err, ymax=z+z.err,color=factor(1:6)))+ geom_smooth(method="lm", se=FALSE, color="blue", formula=y~x)+ ggtitle(bquote(R^2~"="~.(round(r*r,digits=3))))+ xlab("Plant z value") + ylab("Microbial z value") #======================================== #------Multiple regression tree----- #======================================== library("mvpart") OTU.dat = read.table("OTU dataset after resampling.txt",sep="\t",header=T,row.names=1) env.sample = read.table("env variables by samples.txt",sep="\t",header=T,row.names=1) OTU.dat = t(OTU.dat) dim(OTU.dat) colnames(env.sample) mrt_re3 = mvpart(data.matrix(geochip)~TN+TC+PH+MAT_Sean+Plant_rich,group, xv="1se", xval=50, xvmult=10) summary(mrt_re3) #===================================== #---- Modeling the species richness--- #===================================== library("ggplot2") setwd("E:/backup-8.21.15/my work in OU/Lina_Metabolic theory/analysis/modeling") rm(list=ls(all=TRUE)) temp.max = 30 temp.min = 2 # All below formula were obtained through above Fig. 3 z.max1 = 0.0688 + 0.000584*temp.max z.min1 = 0.0688 + 0.000584*temp.min z.max2 = 0.0619 * temp.max^0.0894 z.min2 = 0.0619 * temp.min^0.0894 z.max3 = 0.0067*log(temp.max) + 0.0609 z.min3 = 0.0067*log(temp.min) + 0.0609 z.max = max(z.max1, z.max2, z.max3) z.min = min(z.min1, z.min2, z.min3) c1 = 10^4.352 # obtained from the average intercept of 6 sites samp1 = seq(0,12) area = c(10^samp1, 3.28*10^12) sp.rich.max = c1*(area^z.max) sp.rich.min = c1*(area^z.min) sp.rich.max1 = c1*(area^z.max1) sp.rich.min1 = c1*(area^z.min1) sp.rich.max2 = c1*(area^z.max2) sp.rich.min2 = c1*(area^z.min2) sp.rich.max3 = c1*(area^z.max3) sp.rich.min3 = c1*(area^z.min3) dat4plot = cbind(area, sp.rich.max, sp.rich.min, sp.rich.max1, sp.rich.min1, sp.rich.max2, sp.rich.min2, sp.rich.max3, sp.rich.min3) number_ticks <- function(n) {function(limits) pretty(limits, n)} ggplot(as.data.frame(dat4plot), aes(x=area, y=sp.rich.max1/1000)) + geom_line(colour="blue") + geom_line(aes(x=area, y=sp.rich.min1/1000), color="blue") + geom_line(aes(x=area, y=sp.rich.max2/1000), color="yellow") + geom_line(aes(x=area, y=sp.rich.min2/1000), color="yellow") + geom_line(aes(x=area, y=sp.rich.max3/1000), color="red") + geom_line(aes(x=area, y=sp.rich.min3/1000), color="red") + geom_vline(xintercept = max(area), linetype="dotted") + geom_ribbon(aes(x=area, ymin=sp.rich.min1/1000, ymax = sp.rich.max1/1000), fill="blue", alpha=0.2) + geom_ribbon(aes(x=area, ymin=sp.rich.min2/1000, ymax = sp.rich.max2/1000), fill="yellow", alpha=0.2) + geom_ribbon(aes(x=area, ymin=sp.rich.min3/1000, ymax = sp.rich.max3/1000), fill="red", alpha=0.2) + scale_x_log10(breaks=area[-length(area)])+ scale_y_log10(breaks=c(1, 50,100,150, 200, 250, 300, 350))+ #breaks=number_ticks(5) xlab("Area (m^2)")+ ylab("OTU richness (x1000)")+ theme(axis.text.x = element_text(angle=90), axis.text=element_text(size=16), axis.title=element_text(size=20, face="bold"))