This file contains scripts and information for the manuscript:
“Self-compatibility in yeast is selected for reproductive assurance not population level compatibility”
Note: R code was run locally using Rstudio
Bash scripts were run either on the BioHPC cluster at LRZ, or on a local cluster (corvus) at the Faculty of Biology of LMU.
In the folder
/data/home/wolfproj/wolfproj-13/switching/analyses/ we get
all the files we want to analyze and we split these into 5 separate
batches to run in parallel:
for F in /home/wolfproj/tmp/Bart/pombe/natural_isolates/fastq/*_1.fastq.gz; \
do base=${F##*/}; \
echo ${base/_1.fastq.gz/} >> files.txt; \
done
wc -l files.txt # returns 357 so we need 357/5=71 --> 71+1 lines per file
split --lines=72 files.txt
Then run the script below using:
for batch in x*; do sbatch ../scripts/pipeline.sh $batch; done
The script below (corvus cluster) takes in as a parameter the file name containing the file names without the information of the paired end and file type information of the fastq files as generated above.
#!/bin/bash
#SBATCH --job-name="map_P&M"
#SBATCH --mem=8000
#SBATCH --mail-user=nieuwenhuis@bio.lmu.de
#SBATCH --mail-type=end
#SBATCH --time=2-00:00:00
#SBATCH --cpus-per-task=1
module load samtools/1.4.1
module load bwa/0.7.15
module load bbmap/37.28
module load bedtools/2.26.0
INP=$1
## We will be mapping all reads both to an h90 reference genome with a
## plus cassette at the mat1 locus and with a minus cassette at this locus,
## to avoid bias from using one or the other as a reference. These references
## do not contain the mating type "chromosome" MTR from the reference genomes.
## These are references as described in Nieuwenhuis, Tusso et al. 2018.
#bwa index ../references/h90_full_M.fasta
#bwa index ../references/h90_full_P.fasta
DIRIN="/data/home/wolfproj/tmp/Bart/pombe/natural_isolates/fastq/"
## most intermediate files are remove, but some we want to keep
## to avoid having to repeat everything again...
# mkdir -p fastq # to store the to be mapped fastq files
# mkdir -p coverages # to store coverage info files
# mkdir -p assigned_reads # to store the mat2P or mat3M reads
# mkdir -p short_samfiles # to store the samfile with reads in the mat23 region
while read f; do
## map to references
bwa mem ../references/h90_full_M.fasta $DIRIN/${f}_1.fastq.gz $DIRIN/${f}_2.fastq.gz > ${f}_M.sam
bwa mem ../references/h90_full_P.fasta $DIRIN/${f}_1.fastq.gz $DIRIN/${f}_2.fastq.gz > ${f}_P.sam
## change to bam
samtools view -bS ${f}_M.sam > ${f}_M.bam && rm ${f}_M.sam
samtools view -bS ${f}_P.sam > ${f}_P.bam && rm ${f}_P.sam
## sort and index bamfile
samtools sort ${f}_M.bam -o ${f}_M_sort.bam && rm ${f}_M.bam
samtools sort ${f}_P.bam -o ${f}_P_sort.bam && rm ${f}_P.bam
samtools index ${f}_M_sort.bam
samtools index ${f}_P_sort.bam
## remove intermediate files
#rm ${f}_M.sam ${f}_M.bam ${f}_P.sam ${f}_P.bam
## get coverage of the region upstream of the mating type
samtools depth -aa -r II:2000000-2100000 ${f}_M_sort.bam > ${f}_M.coverage
samtools depth -aa -r II:2000000-2100000 ${f}_P_sort.bam > ${f}_P.coverage
## get coverage of the region right before the mating type
samtools depth -aa -r II:2113449-2114449 ${f}_M_sort.bam > ${f}_M_RTS1.coverage
samtools depth -aa -r II:2113449-2114449 ${f}_P_sort.bam > ${f}_P_RTS1.coverage
#cut down bam file to chrII sub region for faster grepping of readIDs
samtools view -h ${f}_M_sort.bam "II:2113000-2133503" > ${f}_mergedPM.sam
samtools view -h ${f}_P_sort.bam "II:2113000-2133503" >> ${f}_mergedPM.sam
#extract readIDs into text file by RTS1 region IDs
samtools view -f 0x1 ${f}_M_sort.bam "II:2113093-2114093" | cut -f1 > ${f}_readID.txt
samtools view -f 0x1 ${f}_P_sort.bam "II:2113093-2114093" | cut -f1 >> ${f}_readID.txt
sort -u ${f}_readID.txt > ${f}_uniq_readID.txt && rm ${f}_readID.txt
## make merged samfile with all potential reads of interest
# first get header from samfile (first 6 lines, doesn't matter P or M)
# to create emtpy sam file to put grepped reads into
grep '^@' -m6 ${f}_mergedPM.sam > ${f}_mat1.sam
# then grep mate pairs of RTS1 reads (use LC_ALL=C to speed up grep)
LC_ALL=C grep -w -F -f ${f}_uniq_readID.txt < ${f}_mergedPM.sam >> ${f}_mat1.sam
## Get the reads as fastq
# convert sam to bam
samtools view -bS ${f}_mat1.sam > ${f}_mat1.bam && rm ${f}_mat1.sam
# then extract the reads to obtain fastq in a paired and interleaved format
bedtools bamtofastq -i ${f}_mat1.bam -fq ${f}_mat1.fastq
## and cleanup again
rm ${f}*.bai ${f}*.bam
mv ${f}_mergedPM.sam short_samfiles
mv ${f}*_readID.txt short_samfiles
mv ${f}*.coverage coverages
# split reads into fastq files with reads that match better to 1) mat1 region where the
# mating type genes are masked by NNN or 2) a fasta file with the Plus and Minus genes.
# Ambiguous reads (equally mapping) or non-mapping reads are removed.
# "int=t" means that the input file is paired and interleaved
# "outu" indicates the name for the unmapped reads.
bbsplit.sh \
in=${f}_mat1.fastq \
ref=../references/mat2P3M.fasta,../references/aroundMat1_N.fasta \
ambiguous=toss ambiguous2=toss \
basename=${f}_out_%.fastq \
outu=${f}-clean.fastq \
int=t
rm ${f}-clean.fastq ${f}_out_aroundMat1_N.fastq
# split reads into fastq files with reads that better match either mat2P or mat3M
# a third file with ambiguous (equally mapping) or non-mapping reads is also generated.
# "int=t" means that the input file is paired and interleaved
# "outu" indicates the name for the unmapped reads.
bbsplit.sh \
in=${f}_mat2P3M.fastq \
ref=../references/mat2P.fasta,../references/mat3M.fasta \
ambiguous=toss ambiguous2=toss \
basename=${f}_out_2_%.fastq \
outu=${f}-clean.fastq \
int=t
## and cleanup again
rm ${f}-clean.fastq ${f}_out_mat2P3M.fastq
mv ${f}_out_*.fastq assigned_reads/
done < $INP
cd assigned_reads
for F in *.fastq; do
awk '{} END { if(NR>0) print FILENAME "," NR/4; else print FILENAME ",0" }' $F >> ../211112reads_per_strain.csv
done
# when all individual files are generated, we can analyse the reads per file using this script
# read IDs always start either with @ST or with @ERR
for F in *.fastq; do
awk '$1 ~/^@[SE][RT]/{seen[$0]++} END {print FILENAME "," length(seen)}' $F >> 220208uniqreads_per_strain.csv
done
These data are now further analysed in R (packages loaded:
library(tidyverse) and library(patchwork))
First the data is read in, and a \(chi^2\) is performed for each strain to find deviations from 1:1 (for only mat1 and for all loci). The results are then corrected for multiple testing using False Discovery Rate correction.
illreads_raw <-
## this file contains all the counts of the assigned number of reads
## assigned either to P or M.
## the file contains data for:
## - M reads at mat1 : out_mat3M
## - P reads at mat1 : out_mat2P
## - M reads at mat1 : out2_mat3M Double bbsplit filtered
## - P reads at mat1 : out2_mat2P Double bbsplit filtered
## - M reads in the entire library : outAll_mat3M
## - P reads in the entire library : outAll_mat2P
## - M reads in the entire library : outAll2_mat3M Double bbsplit filtered
## - P reads in the entire library : outAll2_mat2P Double bbsplit filtered
## It contains data both for the Jeffares et al. 2015 and
## for Tusso et al. 2019 (EBC strains)
read_csv("data/reads_counts/220208uniqreads_per_strain.csv", col_names = F)
# separate the file names into informative columns and remove uninfomative elements
illreads_raw <-
illreads_raw %>%
separate( X1, into= c( "strain", "accession", "measure", "mat","extension" ) ) %>%
filter( accession != "ERR499321" ) %>% ## JB22 strain that is something else...
select(-extension) %>%
rename(reads = X2) %>%
mutate( measure = as_factor( measure ))
# modify the columns to distinguish Jeffares strains from Tusso strains
illreads2 <-
illreads_raw %>%
mutate( EBCorJB = ifelse( str_detect(accession, "EBC" ),
"EBC",
"JB" ) ) %>%
pivot_wider( names_from = measure, values_from = reads) %>%
select(-outAll, -out ) ## we will not use the single filtered measures, as these are less precise
# combine the reads per sample and calculate ratios
illreads2 <-
inner_join( illreads2 %>% filter(mat == "mat2P"), ## get the reads for all measures
illreads2 %>% filter(mat == "mat3M"), ## in their own columns
by = c("strain", "accession", "EBCorJB" ),
suffix = c(".P",".M")) %>%
select(-mat.P,-mat.M) %>% ## remove no longer needed columns
mutate( strainLong = ## add some pretty name
as_factor(
ifelse( EBCorJB == "EBC",
paste0(strain,"_",accession),
strain ) ) ) %>%
mutate(mat1_reads = out2.P + out2.M, ## calculate reads at mat1
allmat_reads = outAll2.P + outAll2.M, ## calculate reads for whole genomes
freq_allP = outAll2.P/allmat_reads, ## calculate frequency P for whole genomes
freq_mat1P = out2.P/mat1_reads) ## calculate frequency P for mat1
illreads <- illreads2 %>%
filter(mat1_reads > 10) ### remove entries with fewer than 10 reads at mat1
## test for deviation from 0.5 at MAT1 using a chi2 for the data
# generate the right data frame
ill_chi <-
illreads2 %>%
group_by( strainLong, strain ) %>% ## if multiple accessions combine
summarize( allmat_reads_c = sum( allmat_reads, na.rm = T ),
mat1_reads_c = sum( mat1_reads, na.rm = T ),
mat1M = sum( out2.M, na.rm = T),
mat1P = sum( out2.P, na.rm = T),
allP = sum( outAll2.P, na.rm = T),
allM = sum( outAll2.M, na.rm = T),
freq_allP_c = sum(outAll2.P, na.rm = T)/allmat_reads_c,
freq_mat1P_c = sum(mat1P, na.rm = T)/mat1_reads_c
) %>% ungroup()
# Do the chi^2 tests for P/M at mat1 and for all reads.
pmat = pall = NULL
for(i in 1:nrow(ill_chi) ){
x <- ill_chi[i, ]$mat1M # reads at mat1
y <- ill_chi[i, ]$mat1P
testdat <- matrix( c( x , y ), nrow = 2 )
p = chisq.test( x = testdat )$p.value
#print(paste0(ill_chi$strainLong[i] , ", M: ", x, ", P: ", y, ", p-value = ", round( p, 3 ) ) )
pmat <- c(pmat, p)
x <- ill_chi[i, ]$allM # reads at all loci
y <- ill_chi[i, ]$allP
testdat <- matrix( c( x , y ), nrow = 2 )
pa = chisq.test( x = testdat )$p.value
#print(paste0(ill_chi$strainLong[i] , ", M: ", x, ", P: ", y, ", p-value = ", round( p, 3 ) ) )
pall <- c(pall, pa)
}
ill_chi <- ill_chi %>% mutate( pval = pmat, pvalAll = pall )
# perform false discovery rate correction for multiple testing.
psort <- sort(ill_chi$pval)
psortAll <- sort(ill_chi$pvalAll)
fdrtest <- fdrtestAll <- NULL
for (i in 1:nrow(ill_chi)){
fdrtest <- c(fdrtest, ill_chi$pval[i] > match(ill_chi$pval[i],psort) * .05/nrow(ill_chi))
fdrtestAll <- c(fdrtestAll, ill_chi$pvalAll[i] > match(ill_chi$pvalAll[i],psortAll ) * .05/nrow(ill_chi))
}
ill_chi <- ill_chi %>% mutate( fdrtest = fdrtest, fdrtestAll = fdrtestAll)
We can plot this data to get some information on the distribution among strains:
#### Fig 2a ####
# Plot the mating type allele frequency at mat1
# To avoid redundancy, only data from Jeffares et al 2015 is plotted.
F2a <-
ill_chi %>%
mutate( freq_mat1M = 1 - freq_mat1P_c ) %>% ## calculate the other half
mutate( ordervar = freq_mat1P_c ) %>% ## make a sorting variable
pivot_longer( names_to = "mat", ## transform to longer
values_to = "freqs_mat1",
cols = starts_with("freq_mat1")) %>%
filter( !str_detect( strainLong, "EBC")) %>%
mutate(strainLong = fct_reorder(strainLong, ordervar)) %>%
select( -allmat_reads_c, -freq_allP_c, -mat1_reads_c ) %>%
mutate(signif = ifelse( fdrtest, yes = "" , no = "*")) %>%
mutate(sigcols = ifelse( fdrtest, 0, freqs_mat1 ) ) %>%
ggplot( aes( x = strainLong, y = freqs_mat1 , fill = mat ) ) +
geom_bar( stat = "identity", width = 1) +
geom_bar( aes(x = strainLong, y = sigcols ), stat = "identity", fill = "white", alpha = 0.3) +
xlab("Strain") +
ylab(expression( "Frequency of allele at "*italic("mat1" ) ) ) +
scale_y_continuous(limits = c(0,1), expand = c(0, 0)) +
geom_text(x = 100, y=0.75, label="Minus", size=10) + # add minus title
geom_text(x = 110, y=0.25, label="Plus", size=10) + # add plus title
geom_hline( yintercept = 0.5) +
geom_text( aes( label = signif, y = 0.98 ) ) + # add significance asterisk
theme_bw() +
theme(legend.position = "none") +
theme( axis.text.x=element_text(angle=60,hjust=1, size = 4)) +
geom_point( data = ill_chi %>% ## add a point at the reference h90 strain.
filter( !str_detect( strainLong, "EBC")) %>%
mutate(strainLong = fct_reorder(strainLong, freq_mat1P_c)) %>%
mutate( point = ifelse(strainLong == "JB50", freq_mat1P_c,-1) ),
aes(x = strainLong, y = point ), fill = 1, shape=18
)
F2a<- annotate_figure(F2a, fig.lab = "A", fig.lab.face = "bold" )
F2a
ggsave("../Manuscript/Fig_2a.pdf" ,plot = F2a, units = "cm", width = 20, height = 10)
image-20220624151532821
We also added a point for the reference lab strain (JB50), for which the actual value is 0.487
# show the value for data point JB50
ill_chi %>% ## add a point at the reference h90 strain.
filter( !str_detect( strainLong, "EBC")) %>%
mutate(strainLong = fct_reorder(strainLong, freq_mat1P_c)) %>%
mutate( point = ifelse(strainLong == "JB50", freq_mat1P_c,-1) ) %>%
filter( strainLong == "JB50" ) %>% select(strainLong, freq_allP_c)
# strainLong freq_allP_c
# <fct> <dbl>
# JB50 0.487
To summarise these:
# Summarise the number of strains that deviate from 1:1
ill_chi %>%
filter( !str_detect( strainLong, "EBC")) %>%
filter( freq_mat1P_c != 0, freq_mat1P_c !=1 ) %>%
summarise( total = n(),
fdr = sum(!fdrtest),
MoreM = sum( freq_mat1P_c < 0.5 & !fdrtest),
MoreP = sum( freq_mat1P_c > 0.5 & !fdrtest))
# # A tibble: 1 x 4
# total fdr MoreM MoreP
# <int> <int> <int> <int>
# 131 41 34 7
Of the 161 strain, 131 strains are not heterothallic and of those, 41 are deviating. 7 have more P and 34 have more M reads at mat1.
To compare how the results at mat1 and at all loci (incl. mat1) are related, we can make this plot:
# Plot the mating type allele frequency at all loci
# Again, only for JB strains
FS2a <-
ill_chi %>%
mutate( freq_allM = 1 - freq_allP_c ) %>% ## calculate the other half
mutate( ordervar = freq_allP_c ) %>% ## make a sorting variable
pivot_longer( names_to = "mat", ## transform to longer
values_to = "freqs_all",
cols = starts_with("freq_all")) %>%
filter( !str_detect( strainLong, "EBC")) %>% ## remove EBC strains
mutate(strainLong = fct_reorder(strainLong, ordervar)) %>%
mutate(signif = ifelse(fdrtestAll,"",no = "*")) %>%
mutate(sigcols = ifelse(fdrtestAll, 0, freqs_all ) ) %>%
ggplot( aes( x = strainLong, y = freqs_all , fill = mat ) ) +
geom_bar( stat = "identity", width = 1) +
geom_bar( aes(x = strainLong, y = sigcols ), stat = "identity", fill = "white", alpha = 0.3) +
xlab("Strain") +
ylab(expression( "Frequency of allele at "*italic("mat1" ) ) ) +
scale_y_continuous(limits = c(0,1), expand = c(0, 0)) +
geom_hline( yintercept = 0.5) +
geom_text(x = 80, y=0.75, label="Minus", size=10) + # add minus title
geom_text(x = 90, y=0.25, label="Plus", size=10) + # add plus title
geom_hline( yintercept = 0.5) +
geom_text( aes( label= signif, y = 0.98)) + # add significance asterisk
theme_bw() +
theme( legend.position = "none" ) +
theme( axis.text.x = element_text( angle=60, hjust=1, size = 4 ) ) +
geom_point( data = ill_chi %>% ## add a point at the reference h90 strain.
filter( !str_detect( strainLong, "EBC")) %>%
mutate(strainLong = fct_reorder(strainLong, freq_allP_c)) %>%
mutate( point = ifelse(strainLong == "JB50", freq_allP_c,-1) ),
aes(x = strainLong, y = point ), fill = 1, shape=18
)
FS2a <- annotate_figure(FS2a, fig.lab = "A", fig.lab.face = "bold")
FS2a
ggsave("../Manuscript/Fig_S2a.pdf" ,plot = FS2a, units = "cm", width = 20, height = 10)
image-20220624152720702
Many strains show more P alleles when considering all loci. We can plot this and the alleles at mat1 against each other:
### to check if there is a correlation between the total number of mat reads and
### the reads at mat1, this plots the freq P at both against each other,
### without EBC measurements.
FS2c <-
ill_chi %>%
filter( !str_detect( strainLong, "EBC")) %>%
ggplot(aes( x = freq_allP_c , y = freq_mat1P_c, size = mat1_reads_c)) +
geom_point() +
guides(size=guide_legend(title=expression("# "*italic("mat1P") *" reads") ) )+
xlab("Frequency of all P reads") +
ylab(expression( "Frequency of P reads at "*italic("mat1" ) ) ) +
scale_y_continuous(limits = c(0,1)) +
scale_x_continuous(limits = c(0,1)) +
geom_smooth( method = 'lm', size = 1, se = F) +
geom_smooth(data =
ill_chi %>%
mutate( freq_not_mat1_P =
((allmat_reads_c * freq_allP_c ) - # all P reads
(mat1_reads_c * freq_mat1P_c )) / # all mat1 - P reads
(allmat_reads_c - mat1_reads_c) # all reads not at mat1
) %>% filter( freq_not_mat1_P > 0.05, freq_not_mat1_P < 0.90)
,aes(freq_not_mat1_P,freq_mat1P_c),
method = 'lm', size = 1, se = F, color = 'red', linetype="longdash") +
theme_bw() +
theme( axis.text.x=element_text(angle=60,hjust=1))
FS2c <- annotate_figure(FS2c, fig.lab = "C", fig.lab.face = "bold")
FS2c
ggsave("../Manuscript/Fig_S2c.pdf" ,plot = FS2c, units = "cm", width = 13, height = 10)
image-20220624152948772
These results suggest a correlation between the number of P reads and the skew at mat1, however the overall number of reads is skewed to P (most data is to the right of 0.5). The lines show this with and without the presumed heterothallic strains and show strong significant correlations:
## test significance using binomal glm
cordata <- ill_chi %>%
mutate( freq_not_mat1_P =
((allmat_reads_c * freq_allP_c ) - # all P reads
(mat1_reads_c * freq_mat1P_c )) / # all mat1 - P reads
(allmat_reads_c - mat1_reads_c) # all reads not at mat1
) %>% filter( !str_detect( strainLong, "EBC"))
mod_without_heterothallic <- glm( cbind(mat1P, mat1M ) ~ freq_allP_c , family = binomial(),
data= filter( cordata, freq_not_mat1_P > 0.05, freq_not_mat1_P < 0.90 ))
summary(mod_without_heterothallic)
plot(mod_without_heterothallic)
#Call:
#glm(formula = cbind(mat1P, mat1M) ~ freq_allP_c, family = binomial(),
# data = filter(cordata, freq_not_mat1_P > 0.05, freq_not_mat1_P < 0.9))
#Deviance Residuals:
# Min 1Q Median 3Q Max
#-9.5021 -1.7844 -0.3647 0.5767 16.0874
#
#Coefficients:
# Estimate Std. Error z value Pr(>|z|)
#(Intercept) -3.0266 0.1746 -17.33 <2e-16 ***
#freq_allP_c 5.1752 0.3188 16.23 <2e-16 ***
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#(Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 1623.7 on 147 degrees of freedom
#Residual deviance: 1341.4 on 146 degrees of freedom
#AIC: 1859.3
#Number of Fisher Scoring iterations: 4
mod_incl_heterothallic <- glm( cbind(mat1P, mat1M ) ~ freq_allP_c , family = binomial(),
data= cordata)
summary(mod_incl_heterothallic)
plot(mod_incl_heterothallic)
#Call:
#glm(formula = cbind(mat1P, mat1M) ~ freq_allP_c, family = binomial(),
# data = cordata)
#
#Deviance Residuals:
# Min 1Q Median 3Q Max
#-9.878 -1.641 -0.667 0.590 16.559
#
#Coefficients:
# Estimate Std. Error z value Pr(>|z|)
#(Intercept) -3.7175 0.1388 -26.78 <2e-16 ***
#freq_allP_c 6.4242 0.2558 25.12 <2e-16 ***
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#(Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 2607.8 on 160 degrees of freedom
#Residual deviance: 1401.4 on 159 degrees of freedom
#AIC: 1919.2
#Number of Fisher Scoring iterations: 5
One possibility might be that the skew in ‘all’ reads is associated with the copy number of the mating type cassettes, however, this does not seem to hold. We can test this using relative coverage of P and M cassettes (normalized by mean coverage outside of mating type), from Nieuwenhuis, Tusso et al. 2018.
#### Copies per region ####
regions <- read_csv("data/jeffares_coverage/Coverages_per_sample.csv",
col_types = "ffddddddddddddddddddddd") %>%
filter( sample_ID != "ERR499321" ) %>%
mutate( matP_freq = Mat_Plus_rel/(Mat_Minus_rel + Mat_Plus_rel ) )
# join this data with the data from the Illumina results
regions <-
inner_join( regions, filter( combined, mated == "mated" ) ) %>%
filter(strain != "JB22") %>%
mutate( mat1P = freq_mat1P * mat1_reads,
all_P = freq_allP * allmat_reads) %>%
group_by( strain ) %>%
summarize( mat1P = sum(mat1P),
all_P = sum(all_P),
mat1_reads = sum(mat1_reads),
allmat_reads = sum(allmat_reads),
base_coverage = mean( base_coverage),
L_region = mean( L_region_rel ),
Mat_Minus = mean( Mat_Minus_rel ),
Mat_Plus = mean( Mat_Plus_rel ),
K_region = mean( K_region_rel ),
IR_regions = mean( IR_regions_rel )
) %>%
mutate( freq_mat1P = mat1P/mat1_reads,
freq_allP = all_P/allmat_reads)
FS2b<-
regions %>%
ggplot( aes(x = Mat_Plus, y = K_region, size = allmat_reads)) +
geom_smooth( method = "lm", show.legend = F, size = 1, se = F ) +
geom_point() +
ylab(expression( "Relative coverage of "*italic("K" )*"-region" ) ) +
xlab(expression( "Relative coverage of P cassette" ) ) +
guides(size=guide_legend(title=expression( "# "*italic("P" )*" reads" ))) +
theme_bw()
FS2b <- annotate_figure(FS2b, fig.lab = "B", fig.lab.face = "bold")
FS2b
ggsave("../Manuscript/Fig_S2b.pdf" ,plot = FS2b, units = "cm", width = 13, height = 10)
image-20220624153459129
This shows that the relative coverage of the Plus cassette is closely associated with the K region, suggesting repeats of the entire silent mating type region. This is not the case for the Minus cassettes, which generally is not duplicated when the silent region is duplicated.
regions %>%
ggplot( aes(x = Mat_Minus, y = K_region, size = allmat_reads)) +
geom_smooth( method = "lm", show.legend = F, size = 1, se = F ) +
geom_point() +
ylab(expression( "Relative coverage of "*italic("K" )*"-region" ) ) +
xlab(expression( "Relative coverage of M cassette" ) ) +
guides(size=guide_legend(title=expression( "# "*italic("P" )*" reads" ))) +
theme_bw()
image-20220628185806029
Finally, comparing the relative coverage at P for all loci with the allele frequency of P at mat1 shows that the latter has a large range of frequencies, from 0 to 1, and within the expected 1.5 coverage (‘wild type’ copy number with P at mat2 and sometimes at mat1) though smaller, the range still goes from ~0.15 to 0.9.
#### Fig 2c - all v mat1 ####
F2c <-
regions %>%
ggplot( aes(y = freq_mat1P, x = Mat_Plus, size = allmat_reads)) +
geom_point() +
ylab(expression( "Frequency of P at "*italic("mat1" ) ) ) +
xlab(expression( "Relative coverage of P at all loci" ) ) +
guides(size=guide_legend(title=expression( "# "*italic("P" )*" reads" ))) +
theme_bw()
F2c <- annotate_figure(F2c, fig.lab = "C", fig.lab.face = "bold")
F2c
ggsave("../Manuscript/Fig_2c.pdf" ,plot = F2c, units = "cm", width = 13, height = 10)
image-20220628185709614
We performed a check to see if the RTS1 regions is a single copy region by comparing coverage of this region to the mean coverage of the strain (see script above). We aggregate all values per strain into one file:
# when all individual files are generated, we can analyse the results using this script
for F in *.coverage; do
awk '{ total += $3 } END { if(NR>0) print FILENAME total/NR; else print FILENAME " 0" }' $F >> coverage.txt
done
And visualize this in R using:
library(tidyverse)
#### Coverage for RTS1 region ####
cover <- read_delim( "data/coverages/coverage.txt", col_types = "fn", delim = " ", col_names = F )
names(cover) <- c("strain", "coverage")
cover <- cover %>%
separate( strain, into = c("strain", "accession", "ref","region"),extra = "drop" ) %>%
mutate( region = ifelse( region == "coverage", "II100kb", region ) )
mean_cover <- cover %>% group_by(strain, ref, region) %>%
summarize( m_cover = mean(coverage)) %>%
ungroup() %>%
mutate( strain = as.factor(strain)) %>%
pivot_wider(values_from = m_cover, names_from = "region" )
#### Fig S1 RTS1 ####
FS1 <-
cover %>%
group_by(strain, region) %>%
summarize( m_cover = mean(coverage)) %>%
ungroup() %>%
mutate( strain = as.factor(strain)) %>%
pivot_wider(values_from = m_cover, names_from = "region" ) %>%
ggplot(aes(x = strain, y = RTS1/II100kb )) +
geom_point() +
ylim(0,1.2) +
theme_bw() +
ylab(expression( "Normalized coverage at "*italic("RTS1" )*" region" ) ) +
xlab("Strain") +
ggtitle('Fig. S1') +
theme( axis.text.x=element_text(angle=60,hjust=1, size = 5),
panel.grid.major.x = element_blank(),
plot.title = element_text(hjust = -0.08, vjust=0.12))
FS1
ggsave("../Manuscript/Fig_S1.pdf", plot = FS1, units = "cm", width = 20, height = 10)
image-20220624150947577
To test if the illumina pipeline works well, we compare this to the PacBio data that we have for the same strains (EBC### strains sequenced at SciLife in 2018; published in Tusso et al. 2019).
We start with mapping the reads to the reference genomes using
bwa mem -x pacbio and then extract all reads that map to
the RTS1 region (adjacent to mat1) and use blast to see which
allele is located next to this region.
#!/bin/bash
#SBATCH --job-name="map_pb_bwa"
#SBATCH --mem=8000
#SBATCH --mail-user=XXXX
#SBATCH --mail-type=end
module load bwa/0.7.15
module load samtools/1.4.1
module load bbmap/37.28
module load bedtools/2.26.0
DATADIR="/data/home/wolfproj/tmp/Bart/pombe/raw-data/"
REFDIR="/data/home/wolfproj/wolfproj-13/switching/references"
OUTPUTDIR="/data/home/wolfproj/wolfproj-13/switching/pacbio"
SCRIPTSDIR="/data/home/wolfproj/wolfproj-13/switching/scripts"
reference="$REFDIR/$2"
while read $f; do
name=${f%% *}
file=${f##* }
Read1="$DATADIR/${file}_filtered_subreads.fastq.gz"
Outputfile="${name}"
ReadGroup="@RG\tID:$name\tSM:Sample\tPL:PACBIO\tLB:lib1\tPU:unit1"
# this read group header will be added to SAM and BAM and is neccesary for GATK
if [[ ! -e $Read1 ]] ; then
echo "$Read1 forward reads file NOT FOUND"
fi
## map to reference
bwa mem -x pacbio -t 4 $reference $Read1 > ${Outputfile}.sam
# minimap2 -ax map-ont $reference $Read1 > ${Outputfile}.sam # for Oxford Nanopore reads
## convert to bam and remove sam
samtools view -S -b -F 4 ${Outputfile}.sam > ${Outputfile}_mapped.bam && rm ${Outputfile}.sam
## then extract the reads to obtain fastq
bedtools bamtofastq -i ${Outputfile}_mapped.bam -fq $OUTPUTDIR/${Outputfile}_mapped.fastq
# using reformat from bbmap
reformat.sh in=${Outputfile}_mapped.fastq out=${Outputfile}.mapped_pb.fasta
done < $1
This script can be run using.
sbatch strainlist.txt around_matingtype_100kb.fasta
Now that we have the reads of interest, we will look for the region
within each read that has the unique sequence that is centromere
proximate of mat1. We will perform blast using as
library (kb_left_of_mat.fasta) the following fragments:
1kb_Left_ofMat1 II:2113093-2114093
IRL
mat1-Mc - SPBC23G7.09
mat2-Pi - SPMTR.01.1 Reverse Complement (as in genome)
mat2-Pc - SPMTR.02
mat1-Mi - SPBC23G7.17c Reverse Complement (as in genome)
#!/bin/bash
#SBATCH --job-name="blast_pb"
#SBATCH --mem=8000
#SBATCH --mail-user=XXXX
#SBATCH --mail-type=end
module load blast/2.6.0+
DATADIR="/data/home/wolfproj/wolfproj-13/switching/analyses/pacbio"
REFDIR="/data/home/wolfproj/wolfproj-13/switching/references"
OUTPUTDIR="/data/home/wolfproj/wolfproj-13/switching/analyses/pacbio"
SCRIPTSDIR="/data/home/wolfproj/wolfproj-13/switching/scripts"
reference="$REFDIR/$1"
makeblastdb -in $reference -dbtype nucl -title mat1_left
ls $DATADIR | grep mapped_pb.fasta > pbfiles.txt
while read F; do
base=${F##*/}
echo "blast for file $base with number of reads"
wc -l $F
blastn -db $reference -query $F \
-outfmt "6 qseqid sseqid pident qlen length qstart qend sstart send evalue" \
-perc_identity 80 > $OUTPUTDIR/${base%.fasta}.txt
done < pbfiles.txt
This will be plotted in R using the script.
## plot data from blast of the regions around the mating type from pacbio data
library(tidyverse)
# Read in all the files
pbblast <- list.files(pattern = "*.mapped_pb.txt",
path = "data/pacbio", full.names = T) %>%
map_df(~ mutate(read_table(., col_names = F, col_types = "cfnnnnnnnn"),
fileName = . ) ) %>%
mutate( strain = basename(fileName)) # add shorter strain name
## rename columns
colnames(pbblast) <- c("qseqid","sseqid","pident","qlen","length",
"qstart","qend","sstart","send","evalue",
"fileName", "strain")
## add leading 0s for each strain
pbblast <- pbblast %>%
mutate( strain = paste0( "EBC",
str_pad(str_extract(strain,"(\\d)+"),
3, pad = "0") ) )
## describe the orientation of the query by looking at the size of sstart and
## send and set this in a new variable (Fwd = 1, Rev = -1)
pbblast <- pbblast %>% mutate( orientation = ifelse( sstart < send, 1,-1))
pbblast <- pbblast %>%
group_by( qseqid ) %>%
filter(qlen == max(qlen))
# shorten the contig ID and remove IRL hits
pbblast <- pbblast %>%
separate( qseqid, into = c("long","X1","X2"), sep = "/") %>%
unite("qseqid", c(X1, X2), sep = "/", na.rm = TRUE ) %>%
select( -long, -fileName )
pbblast
## Add information on start for the strains that have 1kb_Left_ofMat1 region
pbblast <-
pbblast %>% filter(sseqid == "1kb_Left_ofMat1") %>%
mutate(zero = ifelse(orientation > 0, qstart - sstart + 1, -1 *(qend + send -1 ) ) ) %>%
select(qseqid, zero ) %>%
group_by( qseqid ) %>% summarise( zero = abs( min( zero ) ) ) %>%
left_join( pbblast, . , by = c("qseqid") ) #%>% filter( sseqid =="1kb_Left_ofMat1" )
pbblast
## calculate the start and end position for each fragment relative to zero which is the
## area 1kb_Left_ofMat1
pbblast <-
pbblast %>% mutate( relstart =
ifelse(orientation>0, qstart-zero, zero - qend + 1 ),
relend =
ifelse(orientation>0, qend-zero, zero - qstart )
)
mycolors = c("cyan", "maroon", "lawngreen", "springgreen1","red", "black")
#### In the next part we will plot what each read looks like for the
#### region right next to "1kb left of mat1"
## make a plot for each strain independently and store that as a jpeg.
for( st in levels(factor(pbblast$strain) ) ){
# get only the reads within this one individual
perstrain <- filter( pbblast, strain == st) %>%
filter(!is.na(zero)) %>%
filter(abs( send - sstart) > 100 ) %>%
mutate(qseqid = factor(qseqid))
# check the number of contigs within this individual
contigstot = length(levels(perstrain$qseqid))
maxRelEnd = max( perstrain$relend )
# make an empty plot of the appropriate dimentions for this individual
jpeg(filename = file.path( "figures" ,
gsub(x = st, pattern = ".mapped_pb.txt", ".jpg" ) ),
width = 1000, height = 800)
par(mar = c(3, 10 ,3,3) ) # Change the margin on each side
# make the empty plot
plot(x = 0, y = 0, type="n",
xlim = c( 0, maxRelEnd), ylim = c(1,contigstot),
xlab = "position in bp", ylab = "",
main = paste0("Strain ", gsub(x = st, pattern = ".mapped_pb.txt", "" )),
yaxt='n')
# add the name for each of the reads
for(fragment in 1:length( levels(perstrain$sseqid ) ) ) {
mtext(levels(perstrain$sseqid )[fragment], col = mycolors[ fragment ],
adj = -.1+ fragment/length( levels(perstrain$sseqid ) ),
font = 2)
}
# draw the segments as lines
for( ct in 1:contigstot ) {
percontig <- filter( perstrain, qseqid == levels( perstrain$qseqid )[ct] )
percontig <- percontig %>% filter(percontig$qlen == max( percontig$qlen ) )
for(i in 1:nrow(percontig ) ) {
par(xpd= FALSE)
lines( x=c(percontig$relstart[i], percontig$relend[i]), y = c(ct, ct), col = mycolors[percontig$sseqid[i]], lwd = 2 )
points(x=c(percontig$relstart[i], percontig$relend[i]), y = c(ct, ct), col = mycolors[percontig$sseqid[i]], pch = 16)
par(xpd=NA)
text( x = (maxRelEnd * -0.05), y = ct, labels = levels( perstrain$qseqid )[ct], adj = 1 )
}
}
dev.off()
}
This is an example output:
We can count and then analyze the data for all strains combined.
## Count the number of reads that have either a Plus or a Minus in
## the read close to the 'zero' point that resembles the 1kb region next to mat1
pacbio_results <- pbblast %>%
filter(!is.na(zero)) %>%
filter(abs( send - sstart) > 100 ) %>%
filter( relstart < 3000 ) %>%
mutate(qseqid = factor(qseqid)) %>%
group_by( strain, qseqid ) %>%
summarise( minus = sum( grepl("mat1", sseqid) > 0 ),
plus = sum( grepl("mat2", sseqid) ) > 0) %>%
group_by( strain ) %>%
summarise( mat1_M = sum(minus) , mat1_P = sum(plus) ) %>%
inner_join(. ,
pbblast %>%
filter(abs( send - sstart) > 200 ) %>%
mutate(qseqid = factor(qseqid)) %>%
group_by( strain, qseqid ) %>%
summarise( minus = sum( grepl("mat1", sseqid) > 0 ),
plus = sum( grepl("mat2", sseqid) ) > 0) %>%
group_by( strain ) %>%
summarise( tot_minus = sum(minus) , tot_plus = sum(plus) ) ) %>%
mutate(strain = gsub( ".mapped_pb.txt", "", strain ))
## plot these results for the total number of reads with P or M together
## with the reads in the assumed mat1 region
pb1<- pacbio_results %>%
mutate( strain = factor(strain)) %>%
mutate( P_mat1 = mat1_P / (mat1_M + mat1_P),
P_alleles_tot = tot_plus / ( tot_minus + tot_plus ) ) %>%
pivot_longer(cols = P_mat1:P_alleles_tot, names_to = "name", values_to = "value") %>%
mutate(strain = fct_reorder2(strain, name, value)) %>%
ggplot( aes( x = strain, y = value , fill = name) ) +
geom_bar(position = "dodge", width = 0.8, stat = "identity") +
theme( axis.text.x=element_text(angle=45,hjust=1))
## here we only plot the reads in mat1
pb2<- pacbio_results %>%
mutate( strain = factor(strain)) %>%
mutate( P_mat1 = mat1_P / (mat1_M + mat1_P),
P_alleles_tot = tot_plus / ( tot_minus + tot_plus ) ) %>%
pivot_longer(cols = P_mat1:P_alleles_tot, names_to = "name", values_to = "value") %>%
mutate(strain = fct_reorder2(strain, name, value)) %>%
filter( name == "P_mat1") %>%
ggplot( aes( x = strain, y = value ) ) +
geom_bar(position = "dodge", width = 0.8, stat = "identity") +
theme( axis.text.x=element_text(angle=45,hjust=1))
pb1 + pb2
image-20220624192510534
The data can now be combined with those from the Illumina data and tested for a correlation. This is done both with and without the heterothallic strains:
EBCcombined <- left_join( mutate(pacbio_results, accession = strain), illreads2, by = "accession" ) %>%
mutate(PB_freq_mat1P = mat1_P /(mat1_M + mat1_P),
PB_freq_allP = tot_plus /(tot_minus + tot_plus),
IL_freq_mat1P = out2.P /(out2.P + out2.M),
IL_freq_allP = outAll2.P /(outAll2.M + outAll2.P ),
allReadsMat1 = mat1_P+mat1_M,
allReads = tot_plus+tot_minus)
mod <- aov( PB_freq_mat1P ~ IL_freq_mat1P, data= EBCcombined)
summary(mod)
# Df Sum Sq Mean Sq F value Pr(>F)
# IL_freq_mat1P 1 1.9868 1.9868 193.7 4.49e-11 ***
# Residuals 18 0.1847 0.0103
mod <- aov( PB_freq_mat1P ~ IL_freq_mat1P,
data= filter( EBCcombined, PB_freq_mat1P != 0, PB_freq_mat1P != 1) )
summary(mod)
# Df Sum Sq Mean Sq F value Pr(>F)
# IL_freq_mat1P 1 0.1249 0.12487 8.924 0.0124 *
# Residuals 11 0.1539 0.01399
cor.test( EBCcombined$PB_freq_mat1P, EBCcombined$IL_freq_mat1P)
#t = 13.916, df = 18, p-value = 4.49e-11
#95 percent confidence interval:
# 0.8912823 0.9829755
#sample estimates:
# cor
#0.9565352
F2b <-
EBCcombined %>%
ggplot(aes(PB_freq_mat1P, IL_freq_mat1P, size = allReadsMat1 ) ) +
xlim( 0,1 ) +
ylim( 0,1 ) +
xlab(expression( "PacBio - Frequency Plus at "*italic("mat1" ) ) ) +
ylab(expression( "Illumina - Frequency Plus at "*italic("mat1" ) ) ) +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method=lm, se=FALSE, size = 1 ) +
geom_smooth(
aes(PB_freq_mat1P, IL_freq_mat1P),
method=lm, se=FALSE, size = 1,
data = EBCcombined %>% filter( PB_freq_mat1P != 0, PB_freq_mat1P != 1),
color = 'red', linetype="longdash") +
geom_point() +
guides(size=guide_legend(title="Number\nIllumina\nreads")) +
theme_bw()
F2b
ggsave("../Manuscript/Fig_2b.pdf" ,plot = F2b, units = "cm", width = 13, height = 10)
The correlations are significant, when looking only at the homothallic, and especially when taking all data into account.
image-20220624184526215
We can further test how well the matches of the reads at P and M are at the rest of the genome.
PBall <- EBCcombined %>%
ggplot(aes(PB_freq_allP, PB_freq_mat1P, size = allReads ) ) +
geom_point() +
xlim( 0,1 ) +
ylim( 0,1 ) +
ggtitle(expression( "PacBio all loci vs PacBio "*italic("mat1") ) ) +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method=lm, se=FALSE, size = 1) +
theme_bw()
PB_Ill <- EBCcombined %>%
ggplot(aes(PB_freq_allP, IL_freq_allP, size = allReads ) ) +
geom_point() +
xlim( 0,1 ) +
ylim( 0,1 ) +
ggtitle(expression( "PacBio all loci vs Illumina all loci") ) +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method=lm, se=FALSE, size = 1) +
theme_bw()
ggarrange(PBall, PB_Ill, ncol=2, nrow=1, common.legend = TRUE, legend="bottom")
image-20220624184628219
Where the frequencies for ‘all’ for Illumina and PacBio are very similar, the frequencies for all and mat1 within PacBio are showing a similar pattern to that for the same association within Illumina; somewhat skewed to more P for ‘all’, but large range for P at mat1.
We took images of slides for each of the 57 natural isolates and manually counted the total number of asci, zygotes, cells and spores on 11 October 2019. These can now be analyzed to test how mating efficiency is associated with mating type ratios.
Additionally, there is the same information for a subset from these samples from November 2021.
# Obtain data from 2019 and 2021
# total for 2019 data needs to be calculated, for 2021 has already been done
mating19 <- read_csv("data/191011_MANUAL_counting.csv") %>%
select(-strain,-img, -Well, -Comment) %>%
mutate( total = (Asci * 2 + # asci count double
Cells + # cells count normal
Spores * 0.5 + # spores count half
Zygotes * 2 ) ) # zygotes count double
# total contains
mating21 <- read_csv("data/2111_MANUAL_counting.csv")
## make the two sets compatible and long
names(mating19) <- c("spores", "asci", "cells", "zygotes","strain", "total")
mating19 <- mating19 %>% pivot_longer(spores:zygotes, names_to = "type", values_to = "n")
# combine
matings <- rbind(mating19, mating21)
matings <-
matings %>% group_by(strain, type) %>%
summarise(total = sum(total), n = sum(n)) %>%
mutate( fraction =
ifelse( type == "cells",
n/total, # cells count once
ifelse(type == "spores",
0.5* n / total, # spores count half
2 *n / total ) # zygotes and asci cound double
)
)
## plot the fraction of mated vs unmated cells
matings %>%
mutate(mated = ifelse( type == "cells", "unmated", "mated"),
strain = as.factor(strain)) %>%
group_by(strain, mated) %>%
summarize( fraction_mated = sum(fraction )) %>%
ungroup() %>%
mutate( orderVar = ifelse(mated == "unmated", fraction_mated, 0)) %>%
mutate( strain = fct_reorder( strain, -1*orderVar, min) ) %>%
ggplot( aes(x = strain, y = fraction_mated, fill = mated) )+
geom_bar( stat = "identity") +
xlab("Strain") +
ylab("Fraction cells mated") +
scale_y_continuous(limits = c(0,1), expand = c(0, 0)) +
theme_bw() +
theme( axis.text.x=element_text(angle=60,hjust=1))
image-20220624185243959
This data is now combined with the read counts for each strain from the Illumina data. We calculate the correlation and also see if there is a difference between the groups that do, or do not have a significant deviation. Do test the difference, we remove all clearly heterothallic strains (only when the minor allele is at least at 5%)
conversionTable <- read_csv("data/references/jeffares_toEBC_table.csv")
combined2 <-
inner_join( inner_join( ill_chi, conversionTable ),
matings %>%
mutate(mated = ifelse( type == "cells", "unmated", "mated"),
EBCstrain = as.factor(strain)) %>%
group_by(EBCstrain, mated) %>%
summarize( fraction_mated = sum(fraction )) %>%
ungroup() %>%
filter(mated == "mated") %>%
select(-mated)
)
#### Figure 3c ####
F3cmain <-
combined2 %>%
mutate( Deviate = as.factor(
ifelse(fdrtest, "Symmetrical", "Deviating" ) )
) %>%
# calculate the minor mating type frequency
mutate(asymmetry = -1*abs((freq_mat1P_c)-0.5) +0.5 ) %>%
ggplot(aes(y = fraction_mated, x = asymmetry, color = Deviate)) +
geom_point( aes( size = mat1_reads_c), alpha = 0.8 ) +
theme_bw() +
xlim(0,0.5) +
ylim(0,1) +
ylab("Fraction of the cells that mated") +
xlab("Minor mating type frequency ")
F3cInset <-
combined2 %>%
mutate( deviate = as.factor(
ifelse(fdrtest, "Symmetrical", "Deviating" )
)
) %>%
mutate(asymmetry = -1*abs((freq_mat1P_c)-0.5) +0.5 ) %>%
filter( asymmetry > 0.05 ) %>%
ggplot(aes(y = fraction_mated, x = deviate, fill = deviate) ) +
geom_boxplot() +
geom_jitter( width = 0.1, alpha = 0.3, size = 0.8 ) +
theme_bw() +
ylim(0,1) +
ylab("Fraction of the cells that mated") +
xlab("Minor mating type frequency ") +
theme(legend.position="none",
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text( size = 6),
axis.text.y = element_text( size = 6),
plot.background = element_blank())
F3cmain + F3cInset
mod <- glm( fraction_mated ~ deviate, data = combined2 %>%
mutate( deviate = as.factor(
ifelse(fdrtest, "Symmetrical", "Deviating" )
)
) %>%
mutate(asymmetry = -1*abs((freq_mat1P_c)-0.5) +0.5 ) %>%
filter( asymmetry > 0.05 ),
family = binomial(),
weights = mat1_reads_c)
summary(mod)
#Deviance Residuals:
# Min 1Q Median 3Q Max
#-16.393 -3.348 1.045 2.693 11.886
#
#Coefficients:
# Estimate Std. Error z value Pr(>|z|)
#(Intercept) -0.35518 0.03790 -9.371 < 2e-16 ***
#deviateSymmetrical 0.44580 0.06277 7.102 1.23e-12 ***
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#(Dispersion parameter for binomial family taken to be 1)
# Null deviance: 1603.9 on 60 degrees of freedom
#Residual deviance: 1553.3 on 59 degrees of freedom
#AIC: 1786.8
#Number of Fisher Scoring iterations: 4
image-20220624185645645
The two groups differ from each other, suggesting that mating is more efficient closer to 50:50 ratio of the mating types, however the spread is large in both.
Outcrossing was measured by crossing a subset of 16 natural isolate to two different strains with markers, of which the spores were then plated out and scored for presence of markers on chromosomes I or II. The highest value was added in the file and here combined with the data from the Illumina read analysis and plotted.
outcross <- read_csv("data/211103_outcrossing.csv", )
outcross <-
read_csv("data/220120_outcrossing_clean2.csv" ) %>%
rename(freq_outcr = Freq )
combined_outcross <-
inner_join( combined,
rename(outcross, EBCstrain = strain ) %>%
group_by(EBCstrain,Ref) %>%
summarize(
allGreen =
sum(Green, na.rm = TRUE),
total =
sum(Total, na.rm = TRUE),
freq_outcr =
mean(freq_outcr * 2, na.rm = TRUE))
)
#### Fig 4c ####
F4c <-
combined_outcross %>%
filter( mated == "mated") %>%
group_by(strain, EBCstrain, Ref) %>%
summarize( freq_outcr = mean(freq_outcr),
freq_mat1P = mean(freq_mat1P),
total = sum(total)
) %>%
filter( total >20 ) %>% # remove strains with too few observations.
ggplot(aes(y = freq_outcr, x = -1*abs((freq_mat1P)-0.5) +0.5 , fill = freq_mat1P ) ) +
geom_point( aes(size = total), colour="black", pch=21 ) +
# geom_smooth(method = "lm", se = F) +
theme_bw() +
facet_grid(Ref ~.) +
scale_fill_distiller(type = "seq",
direction = -1,
palette = "Greys") +
# geom_text(aes( label=strain ), hjust=0, vjust=0) +
xlim(0,0.5) +
ylim(0,1) +
ylab("Outcrossing fraction") +
xlab("Minor mating type frequency") +
theme( axis.text.x=element_text(angle=60,hjust=1))
F4c <- annotate_figure(F4c, fig.lab = "C", fig.lab.face = "bold")
F4c
image-20220628105058847
The reviewers suggested to increase the number of measurements and
plot the results differently. So that is what was done. For 57 strains
the measurements were performed resulting in the raw data given in
230222_outcrossing_all57_full.csv
outcross_57 <- read_csv("data/230222_outcrossing_all57_full.csv", ) %>%
mutate( freq_outcr = 2 * survive / total ) %>%
rename( EBCstrain = strain )
combined_outcross_57 <-
inner_join( combined,
outcross_57 ) %>%
filter( mated == "mated")
## add noise to P values that are 0 or 1
## make wide and group because there are multuple rows with mean P at mat1
combined_outcross_57 <-
combined_outcross_57 %>%
group_by(strain, EBCstrain, Ref ) %>%
summarize( freq_outcr = mean(freq_outcr),
freq_mat1P = mean(freq_mat1P)
) %>% ungroup() %>%
select( Ref, strain, EBCstrain, freq_outcr, freq_mat1P ) %>%
pivot_wider( names_from = Ref, values_from = freq_outcr )
combined_outcross_57$freq_mat1P <-
combined_outcross_57$freq_mat1P -
(combined_outcross_57$freq_mat1P < 0.02) * runif(nrow(combined_outcross_57)) * 0.05
combined_outcross_57$freq_mat1P <-
combined_outcross_57$freq_mat1P +
(combined_outcross_57$freq_mat1P > 0.98) * runif(nrow(combined_outcross_57)) * 0.05
## and make long again
combined_outcross_57 <-
combined_outcross_57 %>%
pivot_longer(names_to = "Ref", values_to = "freq_outcr", cols = Minus:Plus)
### plot
F4c_57 <-
combined_outcross_57 %>%
ggplot(aes(y = freq_outcr, x = freq_mat1P ) ) +
geom_smooth( aes( color = Ref), method = "lm", se = F) +
geom_line( aes( group = EBCstrain ) ) +
geom_point( aes( fill = Ref ), pch=21, size = 2.5) +
theme_bw() +
ylab("Outcrossing fraction") +
xlab("Frequency P at mat1") +
ylim(0,1.35) +
theme( strip.background = element_rect( fill = "white"),
axis.text = element_text( size = 6, hjust = 0.5 ),
axis.title = element_text( size = 8, hjust = 0.5 ),
plot.title = element_text( size = 7, hjust = 0.5 ),
legend.text = element_text(size = 5, colour = "black", angle = 0),
legend.title = element_text(size = 5, colour = "black", angle = 0)) +
theme( plot.margin=margin(0, 1.1, 0, 0, "cm") ) ## add to make all figure about same size
F4c_57
F4c_57 <- annotate_figure(F4c_57, fig.lab = "C", fig.lab.face = "bold")
image-20230315114019745
combined_outcross_57 %>%
mutate( thallism = ifelse( freq_mat1P > 0.05 & freq_mat1P < 0.95,
"homothallic",
ifelse( freq_mat1P >= 0.95, "Plus", "Minus") ) ) %>%
group_by( Ref, thallism ) %>%
summarize(mean = mean(freq_outcr, na.rm = T),
median = median(freq_outcr, na.rm = T) )
# Groups: Ref [2]
# Ref thallism mean median
# <chr> <chr> <dbl> <dbl>
#1 Minus homothallic 0.218 0.124
#2 Minus Minus 0 0
#3 Minus Plus 0.977 1.04
#4 Plus homothallic 0.366 0.267
#5 Plus Minus 1.01 0.959
#6 Plus Plus 0 0
F4d_57 <-
combined_outcross_57 %>%
mutate( thallism = ifelse( freq_mat1P > 0.05 & freq_mat1P < 0.95,
"Homothallic",
ifelse( freq_mat1P >= 0.95, "Plus", "Minus") ) ) %>%
mutate( thallism = fct_relevel( thallism, "Minus", "Homothallic", "Plus") ) %>%
ggplot( aes( fill = Ref, x = thallism, y = freq_outcr ) ) +
geom_boxplot() +
theme_bw() +
ylim( 0, 1.35 )+
ylab("Outcrossing fraction") +
xlab("Assumed Thallism") +
theme( strip.background = element_rect( fill = "white"),
axis.text = element_text( size = 6, hjust = 0.5 ),
axis.title = element_text( size = 8, hjust = 0.5 ),
plot.title = element_text( size = 7, hjust = 0.5 ),
legend.text = element_text(size = 5, colour = "black", angle = 0),
legend.title = element_text(size = 5, colour = "black", angle = 0)) +
theme( plot.margin=margin( 0, 1.05, 0, 0, "cm") ) ## add to make all figure about same size
F4d_57 <- annotate_figure(F4d_57, fig.lab = "D", fig.lab.face = "bold")
image-20230315114150605
Run on LRZ bio_HPC
module load vcftools/0.1.14-gcc8
module load plink/1.07-gcc8
DIRIN="/dss/dsslegfs01/pr53da/pr53da-dss-0022/nobackup/private/pac_bio/03_denovo_assembly/natural_strains_group1/alignment/08_heritability/ldak.5.94_newdata/"
SCRIPTS="/dss/dsshome1/lxc02/di36fuj/Rstudio/001_switching_asymmetry/scripts/ldak"
cd $DIRIN
vcftools --vcf Schizosaccharomycespombesnps.20121212.filt3c.all.snpsonly.anno3.vcf \
--remove-indv JB1207 \
--remove-indv JB1196 \
--plink --out allsnps.plink
plink --noweb --make-bed --file allsnps.plink --out allsnps
vcftools --vcf ${DIRIN}Schizosaccharomycespombeindels.gatk.union.genotyped.filter.vcf \
--remove-indv JB1207 \
--remove-indv JB1196 \
--plink --out cnv.plink
plink --noweb --make-bed --file cnv.plink --out cnv
vcftools --vcf ${DIRIN}Schizosaccharomycespombe113curatedSVcallsJeffares2016.vcf \
--remove-indv JB1207 \
--remove-indv JB1196 \
--plink --out SV.plink
plink --noweb --make-bed --file SV.plink --out SV
./ldak5.1.linux --calc-kins-direct allsnpskinship --bfile allsnps --ignore-weights YES --power -0.25
./ldak5.1.linux --calc-kins-direct cnvkinship --bfile cnv --ignore-weights YES --power -0.25
./ldak5.1.linux --calc-kins-direct SVkinship --bfile SV --ignore-weights YES --power -0.25
# heritability analisys using allsnps-cnv-invtra
for m in {1..10}; do ${SCRIPTS}/ldak5.1.linux \
--reml output/herit.$m \
--pheno phenorm.and.wine.2016-08-25.txt \
--mgrm mlist-allsnps-cnv-invtraN.txt \
--mpheno $m \
--keep cnvkinship.grm.id ;
done
# heritability analisys only SNPs using the SNP dataset from Jeffares et al. 2015 Nature Microbio
head -1 mlist-allsnps-cnv-invtraN.txt > mlist-allsnpsN.txt
for m in {1..228}; do ./ldak.5.94 --reml output/snpsherit.$m --pheno phenorm.and.wine.2016-08-25.txt --mgrm mlist-allsnpsN.txt --mpheno $m --keep cnvkinship.grm.id ; done
# heritability analysis only ancestral haplotypes using the dataset from Tusso et al. 2019 GBE
echo ancestralhapkinship > mlist-ancestralhap.txt
for m in {1..228}; do ./ldak.5.94 --reml output/anchapherit.$m --pheno phenorm.and.wine.2016-08-25.txt --mgrm mlist-ancestralhap.txt --mpheno $m --keep cnvkinship.grm.id ; done
## using a custom perl script from Dan Jeffares (Jeffares et al 2015) the results are
## reformatted for easier interpretation of the results.
perl ./parse.reml3.pl "output/herit*reml" mlist-allsnps-cnv-invtraN.txt | perl -pne 's/output\/herit\.//g;s/\.reml//g' > her-all.txt
perl ./parse.reml3.pl "output/snpsherit*reml" mlist-allsnpsN.txt | perl -pne 's/output\/snpsherit\.//g;s/\.reml//g' > her-snps.txt
perl ./parse.reml3.pl "output/anchapherit*reml" mlist-ancestralhap.txt | perl -pne 's/output\/anchapherit\.//g;s/\.reml//g' > her-anchap.txt
For the simulations a number of functions were used from the file
scripts/CA_functions_asym.R that were called from the
script as given below.
The input values are given in a separate txt file . For the main analyses, the input file had these values:
# set variables
SIZE = 200 ## size of the grid (SIZE x SIZE) the smaller the faster
GEN = 500 ## maximum number of growth cycles
SEXGEN = 1 ## number of sexual cycles
DENS = c(0.0001,0.0005,0.001,0.005,0.01)
## list of densities of cells at the start of a new sexual cycle
SWTP = seq(0,100,5)/100 ## list of probabilities for P to switch
SWTM = seq(0,100,5)/100 ## list of probabilities for M to switch
SWT = 1 ## frequency of switchers at the start of a run
COST = c(0) ## list of costs of growth reduction for a switching strain
TORUS = FALSE ## How edges are treated (T = torus shape, F = edge is hard edge)
localMating = TRUE ## Local (T) or random (F) mating
REPEATS = c(1:100) ## total number of replicate runs per setting
drawCycleEnd = FALSE
startPop = "density" ## indicates how each sexual generation is started can be one of three:
## "oneCell" - places a single cell on the center of the grid
## "threeCells" - places three cells on a grid
## "density" - randomly places cells to a density of DENS on the grid
allow_selfing = TRUE
outpath="output"
outFile="asymmetric_switching_outcrossing"
These were then run using this script which simply loops over the values for the variables, to change the density, switching frequencies \(P\rightarrow M\) and \(M\rightarrow P\) using 100 replicates for each parameter set. Note that, even though here there are only switchers, the simulations keep track of the frequencies switchers and non-switchers (i.e. homo- and heterothallics), which in these simulations are always 1 and 0 respectively. This is a remnant from the code used in Nieuwenhuis et al. 2018.
#!/usr/bin/env Rscript
#############################################################################################
##
## Simulations of invasion of mating type switching in a single celled organisms in
## a spacial environment.
##
## We start with making an environment that consists of a matrix of size SIZE which
## we populate with an initial population of cells for which the density can vary
## Each gridcell has information on:
## - mating type if occupied
## - if it can switch
##
## Then the function runCA is called in which the cells will grow until then
## entire matrix is filled. During growth there are a few parameters that affect
## growth:
## - fitness of the switcher vs the non-switcher
## - frequency of switching efficiency
##
##
#############################################################################################
getwd()
args = commandArgs(trailingOnly=TRUE)
# test if there is at least one argument: if not, return an error
if (length(args)==0) {
stop("At least one argument must be supplied (input file with
information on variables). See example_vars.txt.",
call.=FALSE)
} else if (length(args)==1) {
# default scripts location
args[2] = "scripts"
}
.libPaths("/dss/dsshome1/lxc02/di36fuj/R/debian10/3.6")
# loading dependencies
source( paste0(args[2],"/CA_functions_asym.R") )
source( args[1] )
## generate the basename to store results and log using date and time
if( !exists("outpath" ) ) outpath = "." ## check if path and basename are given in args[1]
if( !exists("outFile" ) ) outFile = "out"
filebase = file.path( outpath, paste0( gsub("[: -]", "", Sys.Date()),"_",
outFile,"_",
gsub("[: -]", "", substr(Sys.time(), 12, 19))))
## log the settings for this session
fileConn<- file(paste0(filebase, ".out"))
writeLines( readLines(args[1]), fileConn)
close(fileConn)
## this produces a data frame in which the data is collected that will be saved for further analyses.
output<- data.frame(matrix( ncol = 11 ))[0,]
names(output) <- c( "size", "swtP","swtM", "replicate", "cost", "dens","switchers",
"plus", "minus", "matedtotal", "outcrossed")
#### start loops ####
for(repet in REPEATS ){ # loop over repeats
for(dens in DENS ){ # loop over densities
for( swtP in SWTP ) { # loop over P-->M
for(swtM in SWTM ) { # loop over M-->P
## swtM = swtP # to simulate only diagonal, comment out previous line and uncomment this line
for( cost in COST ) { # loop over cost for switching
SW = SWT # set current switchers frequency
for(sexgen in 1:SEXGEN){
########## Make empty matrices
mat<-matrix(0, nrow = SIZE, ncol = SIZE)
swit<-matrix(0, nrow = SIZE, ncol = SIZE)
mated<-matrix(FALSE, nrow = SIZE, ncol = SIZE)
switch( startPop,
"oneCell"={
## We will populate the matrix with one cell, either P or M
mat[round( SIZE / 2), round(SIZE/2)] <- round(runif(1))+1
occup <- mat > 0 ## update occupied matrix
swit[ round( SIZE / 2), round(SIZE/2) ] <-
ifelse(runif(1) < SW, 1, 2 ) ## tell if cell is switcher
},
"threeCells"={
mat[round( SIZE / 4), round(3*SIZE/4)] <- round(runif(1))+1
mat[round( SIZE / 4), round(SIZE/4)] <- round(runif(1))+1
mat[round( (SIZE / 4)/tan(60)), round(SIZE/2)] <- round(runif(1))+1
if(length(SW)==1) SW <- ( runif(3) > SW ) + 1
swit[round( SIZE / 4), round(3*SIZE/4)] <- SW[1]
swit[round( SIZE / 4), round(SIZE/4)] <- SW[2]
swit[round( (SIZE / 4)/tan(60)), round(SIZE/2)] <- SW[3]
},
"density"={
mat[matrix(runif(SIZE^2) > 1-dens, nrow=SIZE, ncol=SIZE)] <- 1 ## set mating type +
mat[matrix(runif(SIZE^2) > 1-dens, nrow=SIZE, ncol=SIZE)] <- 2 ## set mating type -
## if there are no cells, add one to the center
if( sum( mat ) == 0 ) mat[round( SIZE / 2), round(SIZE/2)] <- round(runif(1))+1
occup <- mat > 0 ## tell if is occupied
## indicate if the cell can switch (1) or cannot switch (2)
swit[ occup ] <- (runif( sum( occup ) ) > SW ) + 1
},
stop("Invalid value for 'startPop'")
)
#### run full cycle of growth ####
print(paste0("Density: ",dens,
" Fitness cost: ",cost,
" Switching: P ",swtP,
" - M ", swtM ,
" Repetion: ",repet,
" Transfer: ", sexgen))
ding <-
runCA(mat = mat, swit = swit, switPM = c(swtP,swtM),
SWfit = cost, torus = TORUS, GEN = GEN) ## run one round of growth
mat <- ding[[ 1 ]] ## update mat matrix
swit <- ding[[ 2 ]] ## update swit matrix
ind <- ding[[ 3 ]] ## update individuals matrix
#### mate the population ####
mated<-
runMate( mat, ind = ind, local = T, torus = TORUS, allow_selfing = allow_selfing)
if( repet == 1 & drawCycleEnd ){
jpeg(paste0("figures/outfile","_dens",dens, "_cost", cost, "_swtP",
swtP, "swtM", swtM, "_rep" , repet,'.jpg'),
width = 350, height = 400)
par(mfrow=c(2,2),mar=c(0.5,0.5,2,0.5) )
image( mat, axes = FALSE )
image( swit, axes = FALSE )
image( mated, axes = FALSE )
mtext(paste0( "dens: ",dens, ", cost: ", cost, ", swtP: ", swtP, ", swtM: ", swtM),
side = 3, adj = -1, line = 1, outer = F)
image(ind, col = rainbow(max(ind+1)), axes = FALSE)
dev.off()
}
# Calculate number of Switchers and Non-Switchers
totNS <- sum( swit == 2 ) ## non switchers
totSW <- sum( swit == 1 ) ## switchers
# Calculate number of mated Switchers and Non-Switchers
ns <- sum ( mated > 0 & swit == 2)
sw <- sum ( mated > 0 & swit == 1)
output<-rbind(output,
c(SIZE, repet, swtP, swtM,
cost, dens,max(ind),
sum( mat == 1 ), # mat P (total is sum P + M)
sum( mat == 2 ), # mat M
sum( mated > 0 ), # mated
sum( mated == 2 ), # outcrossed total (selfed = mated - outcrossed)
sw, # total switchers
sum( mated > 0 && # mated switchers (matedNS = mated - matedSW)
swit == 1),
sum( mated == 2 && # outcrossed switchers
swit == 1) # (selfedSW = matedSW - outcrSW)
)
)
SW <- sw /(ns + sw)
} # end sexgen
} #end cost
}
## write the results so far to a file.
names(output) <- c( "size","replicate", "swtP","swtM", "cost", "dens","individuals",
"plus", "minus", "mated", "outcrossed",
"switchers","switchers_mated","switchers_outcrossed")
write.csv(x = output,
file = paste0( filebase, ".csv"),
row.names = F)
} #end swtP
} #end dens
} #end rep
We also simulated what happens if homothallism is possible, but
mating can only occur through outcrossing. For this we ran the code as
before, but with allow_selfing = F and we only measure the
diagonal (\(rate_{P\rightarrow M} =
rate_{M\rightarrow P}\)).
Next to measurement for an entire grid of 200x200, also the
‘colonies’ after only 8 or 16 cell divisions were analyzed. Similar to
above simulations were run, but now with a smaller grid size (to
increase speed) and with the asexual generation time set to 8 or 16. The
setting changed were: GEN = 8 and SIZE = 30
for 8 cell divisions and GEN = 16 and
SIZE = 50 for 16 cell divisions.
Data that was obtained as described above was analyzed in R. The raw data (one row per simulation of a single generation) was read. Each row has information on the switching frequencies, the number of individuals P and M and whether the have mated. First, read in the data from the 3 different data sets (1. full runs, 2. diagonal with only outcrossing and 3. the small colonies).
dir = "D:/Dropbox/Projects/Pombe/NaturalVariationMatingSwitching/Rstudio/"
setwd(dir)
# Import the data for normal simulations
df_sim <- list.files(pattern = "20220420_asymmetric_switching_outcrossing.*.csv",
path = "output", full.names = T) %>%
map_df(~ mutate(read_csv(.), fileRun = .)) %>% as_tibble()
size = df_sim$size[1]
# Import the simulation data when only outcrossing is allowed
only_out <- list.files(pattern = "2022042[01]_asymmetric_only_outcrossing.*.csv",
path = "output", full.names = T) %>%
map_df(~ mutate(read_csv(.), fileRun = .)) %>% as_tibble()
# Import data for 8 or 16 asexual growth cycles (very small colonies)
df_8 <- read_csv("./output/20191121_competition_8divisions.csv", col_select = -1) %>%
filter(size!=0)
df_16 <- read_csv("./output/20191121_competition_16divisions.csv", col_select = -1) %>%
filter(size!=0)
# combined 8 and 16 generations into one dataframe
df8_16 <- rbind(mutate(df_8, pop = "8 Generations"),
mutate(df_16,pop = "16 Generations") )
We will start with plotting the results for each density for all different values:
#### Fig S4 ####
densities<- as.numeric(levels(as.factor(df_sim$dens) ))
matedPlots <- vector('list', length( densities) )
## plot the frequency of the mated cells for all switching frequencies
## fill a list with plots for each density
for(x in 1:length(densities) ){
matedPlots[[ x ]] <-
df_sim %>%
group_by(swtP, swtM, dens ) %>%
summarise( freq = mean( mated/ size^2, na.rm = TRUE ) ,
n = n()) %>% filter(dens == densities[x]) %>%
ggplot(aes(x = swtP, y= swtM, z = freq)) +
geom_raster( aes( fill = freq ) , interpolate = F) +
scale_fill_gradientn(colours=c("#FF0000FF","#FFFFFFFF","#0000FFFF"), limits = c(0,1)) +
ylab("Probability of change when Plus") +
geom_contour() +
ggtitle(paste0("Density = ", densities[x]) ) +
geom_abline() +
scale_x_discrete(expand=c(0,0))+
scale_y_discrete(expand=c(0,0))+
xlab("Probability of change when Minus") +
theme(#axis.line=element_blank(),
legend.position="bottom",
axis.line = element_line(colour = "black"),
strip.background = element_rect(fill = "white"),
axis.text= element_text(size = 3),
axis.ticks=element_blank(),
plot.title = element_text(size = 7, hjust = 0.5 ),
axis.title = element_text(size = 5)
) +
guides(fill = guide_colorbar(barwidth = 20, barheight = 1 )
)
}
tgrob <- text_grob("Frequency of \n mated cells",size = 15)
title1 <- as_ggplot(tgrob) + theme(plot.margin = margin(0,0,0,0, "cm"))
FS4 <- ggarrange(title1, matedPlots[[1]],matedPlots[[2]],matedPlots[[3]],
matedPlots[[4]],matedPlots[[5]],
common.legend = T, legend = "bottom")
FS4 <- annotate_figure(FS4, fig.lab = "Fig. S4", fig.lab.face = "bold")
FS4
image-20220624193225481
As can be seen, there is an increase of mating efficiency at high switching frequencies, but only close to the diagonal. To better see what happens at the diagonal, we will now plot only the values for when switP = switM (which is the diagonal).
## plot diagonal (i.e. when P = M switch frequencies) against the fraction of
## cells that mated
F3a <-
df_sim %>%
group_by(swtP, swtM, dens ) %>%
summarise( Mating = mean( mated/ size^2 ) ,sd_mated = sd( mated / size^2),
Outcrossing = mean(if_else( mated ==0,0,outcrossed/mated ) ),
n = n()) %>%
filter( swtP == swtM) %>%
mutate( Density = as.factor(dens), SwitchingEfficiency= swtP ) %>%
ggplot(aes(x = SwitchingEfficiency, color = Density)) +
ylab("Mating efficiency") +
geom_line(aes( y = Mating ), size = 0.5) +
ylim( 0, 1 ) +
xlab("Switching rate (P = M)") +
theme_bw() +
theme(
strip.background = element_rect( fill = "white"),
axis.text= element_text( size = 6, hjust = 0.5 ),
axis.title = element_text( size = 8, hjust = 0.5 ),
#axis.ticks=element_blank(),
plot.title = element_text( size = 7, hjust = 0.5 )
)
F3a <- annotate_figure(F3a, fig.lab = "A", fig.lab.face = "bold")
F3a
image-20220624193428690
There appears to be little effect of density, but switching rate is improving mating efficiency. From the earlier figures it seems that a change in only one of the two (so away from the diagonal) will reduce efficiency. To capture this, we can plot the value when keeping one switching direction fixed and see what happens with the mating frequency for a change at the other direction. To be able to compare the value between the different switching frequencies, the value on the x axis shows the log of the ratio of \((M\rightarrow P)/(P\rightarrow M)\).
## then plot the curve and the maximum point for different fixed points.
F3b <-
df_sim %>%
group_by(swtP, swtM, dens ) %>%
summarise( freq = mean( mated/ size^2 )) %>%
filter( dens == 0.01) %>%
filter( swtP %in% c(0.1,seq(0.2,1, 0.2) ) ) %>%
mutate( swtMlogrel = log( swtM/swtP ),
swtP = as.factor( swtP ) ) %>%
ggplot(aes(x = swtMlogrel,
y = freq,
linetype = swtP,
color = swtP
) ) +
geom_line( size = 0.5) +
ylim( 0, 1 ) +
geom_point(
data = maxMat,
aes(x = swtMlogrel, y = freq),
size = 1
) +
xlab("log( M->P / P->M )") +
ylab("Mating frequency") +
theme_bw() +
theme(
strip.background = element_rect( fill = "white"),
axis.text= element_text( size = 6, hjust = 0.5 ),
axis.title = element_text( size = 8, hjust = 0.5 ),
#axis.ticks=element_blank(),
plot.title = element_text( size = 7, hjust = 0.5 )
)
F3b <- annotate_figure(F3b, fig.lab = "B", fig.lab.face = "bold")
F3b
image-20220624194244968
To test how switching affects the amount of outcrossing and haploid-selfing, we tracked for each mating if this occurred with a cells derived from the same initially seeded cell or with a cells from a different clone. Here we plot these results as a relative value. The plot shows the fraction of the mated cell how many outcrossed
#### Fig 4a ####
## Plot the values for the fraction of outcrossing under selfing allowed or not
F4a <-
rbind(
# combine the data from the diagonal with selfing
df_sim %>% filter( swtP == swtM ) %>% mutate( `Self-compatibility` = "Also selfing" ),
# and from the simulations when selfing is not possible (only P=M was simulated)
only_out %>% mutate( `Self-compatibility` = "Only outcrossing" )
) %>%
mutate( freq = minus/(plus+minus) ) %>%
group_by(swtP, swtM, dens, `Self-compatibility` ) %>%
summarise( freq = mean( freq, na.rm = TRUE),
mated = mean(outcrossed / size^2, na.rm = TRUE),
n = n()) %>%
mutate( Density = as.factor(dens), SwitchingEfficiency= swtP ) %>%
ggplot(aes(x = SwitchingEfficiency,
y = mated,
color = Density,
linetype = `Self-compatibility`) ) +
ylab("Fraction of cells that outcrossed") +
geom_line( size = 0.5) + # linetype="dashed")+
# ylim( 0, 1 ) +
xlab("Switching rate (P->M = M->P)") +
theme_bw() +
theme(
strip.background = element_rect( fill = "white"),
axis.text= element_text( size = 6, hjust = 0.5 ),
axis.title = element_text( size = 8, hjust = 0.5 ),
#axis.ticks=element_blank(),
plot.title = element_text( size = 7, hjust = 0.5 )
)
F4a <- annotate_figure(F4a, fig.lab = "A", fig.lab.face = "bold")
F4a
ggsave("../Manuscript/Fig_4a_outcrossing.pdf",F4a, width = 8.5, height = 6, units = "cm")
image-20220628183405523
Outcrossing is increased by increased switching, but only under intra-haploid incompatibility. When intra-haploid mating is possible, the fraction of cells that outcrossed is reduced. Looking at the relative number of cells that mate through outcrossing (i.e. the fraction of mated cells that reproduce through outcrossing) the reduction is even starker.
#### Fig 4b ####
## plot diagonal (i.e. when P = M switch frequencies) for the fraction of
## outcrossed spores vs haploid-selfed spores
F4b <-
df_sim %>%
group_by(swtP, swtM, dens ) %>%
summarise( Mating = mean( mated/ size^2 ) ,sd_mated = sd( mated / size^2),
Outcrossing = mean(if_else( mated ==0,1,outcrossed/mated ) ),
outcr.sd = sd(outcrossed/mated), n = n()) %>%
filter( swtP == swtM) %>%
mutate( Density = as.factor(dens), SwitchingEfficiency= swtP ) %>%
ggplot(aes(x = SwitchingEfficiency, color = Density)) +
ylab("Fraction of matings through outcrossing") +
ylim( 0, 1 ) +
geom_line(aes( y = Outcrossing), size = 0.5 ) +
theme_bw() +
xlab("Switching efficiency (P = M)") +
theme_bw() +
theme(
strip.background = element_rect( fill = "white"),
axis.text= element_text( size = 6, hjust = 0.5 ),
axis.title = element_text( size = 8, hjust = 0.5 ),
#axis.ticks=element_blank(),
plot.title = element_text( size = 7, hjust = 0.5 )
)
F4b
F4b <- annotate_figure(F4b, fig.lab = "B", fig.lab.face = "bold")
ggsave("../Manuscript/Fig_4b.pdf",F4b, width = 8.5, height = 6, units = "cm")
image-20220628184018399
The efficiency of mating is almost completely driven by the dynamics within a clone as can be seen from these simulations using only 8 or 16 cell divisions. Though more noisy, the overall pattern is highly similar.
#### Fig S3 ####
smallTheme <-
theme(#axis.line=element_blank(),
legend.position="bottom",
axis.line = element_line(colour = "black"),
axis.title = element_text(size = 8, colour = "black", angle = 0),
legend.text = element_text(size = 7, colour = "black", angle = 0),
legend.title = element_text(size = 8, colour = "black", angle = 0),
panel.border = element_rect(colour = "black", fill=NA, size=1),
strip.background = element_rect(fill = "white"),
strip.text.x = element_text(size = 6, colour = "black", angle = 0),
plot.title = element_text(hjust = 0.5))
mated8_16 <-
df8_16 %>%
group_by( pop, SWTP, SWTM ) %>%
summarise( freq = mean( switch_mated/(plus+minus)) ,St.Dev = sd( switch_mated /(plus+minus)) ) %>%
ggplot(aes(x = SWTP, y= SWTM, z = freq)) +
geom_raster( aes( fill = freq ) , interpolate = FALSE) +
#geom_point( aes( size = St.Dev ) ) +
scale_fill_gradientn(colours=c("#FF0000FF","#FFFFFFFF","#0000FFFF"), limits = c(0,1)) +
ylab("Probability of change when Plus") +
geom_contour() +
geom_abline() +
ggtitle( "Frequency of cells mated") +
facet_grid( pop ~ .) +
scale_y_discrete(breaks= c(1:100)) +
scale_x_discrete(breaks= c(1:100)) +
xlab("Probability of change when Minus") +
smallTheme +
guides(fill = guide_colorbar(barwidth = 10, barheight = 1 )
)
ratio8_16 <-
df8_16 %>%
group_by(pop, SWTP, SWTM ) %>%
summarise( freq = mean( minus/(plus+minus))) %>%
mutate( freq = 0.5- abs(0.5-freq ) ) %>%
ggplot(aes(x = SWTP, y= SWTM, z = freq)) +
geom_raster( aes( fill = freq ) , interpolate = FALSE) +
#geom_point( aes( size = St.Dev ) ) +
scale_fill_gradientn(colours=c("#FF0000FF","#FFFFFFFF","#0000FFFF"), limits = c(0,0.5)) +
ylab("Probability of change when Plus") +
geom_contour() +
geom_abline() +
facet_grid( pop ~ .) +
ggtitle( "Proportion mating type") +
scale_y_discrete(breaks= c(1:100)) +
scale_x_discrete(breaks= c(1:100)) +
xlab("Probability of change when Minus") +
smallTheme +
guides(fill = guide_colorbar(barwidth = 10, barheight = 1 )
)
combinedFS4 <-
(mated8_16 + ratio8_16) +
#plot_layout(guides = "collect") &
theme(legend.position = 'bottom') + plot_annotation(title = "Fig. S3", tag_levels = 'A')
combinedFS4
ggsave("../manuscript/Fig_S3_v2.pdf", combinedFS4, width = 16, height = 14.5, units = "cm")
image-20220628184354665
These are the functions called in the simulations as described above.
## Functions for cellular automata simulations
## Written by XXXX
## Latest update: 29.10.2021
#### Resample ####
## This function solves a problem with the 'sample' function from R. sample returns when used with a
## single value a vector of the size of that value, but when used with a vector it returns a new vector of same size
## with the items in the vector in random order. This function only uses the latter function, even when used with
## a vector of a single value.
resample <- function(x, ...) {
x[sample.int(length(x), ...)]
}
#### asexual growth of colonies ####
## This function runs over a matrix performing growth of the matrix until the matrix is completely full,
## or until GEN cycles of growth are performed. It takes a 'switching' matrix too, which indicates the
## switching ability (change from P to M) and state of each cell. During each round a cell has a chance
## to grow, depending on its fitness which can be set to differ for switchers and non-switchers.
## Additionally, the switching efficiency can be set to differ for the Plus and Minus types.
runCA <- function(mat, ## matrix indicating mating type of the cell
swit, ## matrix indicating switching state of cell
SWfit = 1, ## fitness of switcher
switFreq = 1, ## frequency of switching
switPM = c(1,1),## probability of P switch to M and M switch to P
draw = 0, ## frequency of drawing
drawfinal = F, ## draw the matrix when full?
verbose = F, ## print information during run?
fitscale = 1, ##
indivs = F, ## should bookkeeping be performed for each individual?
GEN = 1000, ## total number of sexual cycles
torus = FALSE ## should the grid be a torus?
){
occup <- mat > 0 ## set all occupied coordinates
SIZE <- ncol(mat) ## get size of grid used
## We'll be tracking for each individual who derived from whom.
ind <- occup
ind[occup] <- sample( sum(occup) )
#make a look-up table to store for each coordinate in the matrix the coordinates of the neighbors
SUBCO <-matrix(0, nrow = SIZE^2, ncol = 8)
for(i in 1:SIZE^2){
SUBCO[i,] <- subcoordinates(i, SIZE, torus)}
## loop for GEN generations
for(j in 1:GEN){
## initiate the start matrices for this generation
fit<- matrix(0, nrow = SIZE, ncol = SIZE) ## make a matrix which will contain the fitness of each cell
neigh <- simecol::eightneighbors( occup ) ## count number of cells occupied around new individuals
grow <- matrix(0, nrow = SIZE, ncol = SIZE) ## reset info on whether a cell has grown this round
matnew <- matrix(0, nrow = SIZE, ncol = SIZE) ## reset matrix that will store the new mating type info
fit[occup] <- 1 ## set fitness for non-switchers to 1
fit[occup & swit == 1 ] <- (1 - SWfit) ## set fitness for switchers to 1-SWfit
fit <- fit * runif(SIZE*SIZE) ## calculate this rounds fitness
# Give some feedback if wanted
if(verbose){print(paste0("generation: ",j,", number of cells: ", sum(occup)))}
## In each generation the loop is as follows:
# - Find all the coordinates that are empty and that have at least one neighbor
# - Randomly sample on of these cells as focal
# - Look at the neighbors and see if any can still grow
# - let one cell grow and let its offspring occupy the empty focal cell if possible
for(i in resample( which ( neigh > 0 & !occup ) ) ){ ## use resample for if only one left
## make five vectors length 8 with info on the cells around the focal to work with
subcoord <- SUBCO[i,] ## get the coordinates of neighbors
subfit <- subswit <- subgrow <- subind <- submat <- rep(0,8) ## make empty matrices
## Get for the neighbors the:
submat [ subcoord > 0 ] <- mat [ subcoord ] ## mating type
subgrow[ subcoord > 0 ] <- grow[ subcoord ] ## did they grow yet?
subswit[ subcoord > 0 ] <- swit[ subcoord ] ## switching status
subfit [ subcoord > 0 ] <- fit [ subcoord ] ## fitness
subind [ subcoord > 0 ] <- ind [ subcoord ] ## individual
if( neigh[i] > sum(subgrow) ){ ## check if any neighbor can still grow
fits <- subfit ## calculate a chance for each potential
## parent celland scale by fitness of each
if( max(fits) >= (0.5 * fitscale)){ ## if any of the cells are fit enough...
p <- which.max( fits ) ## ... pick individual with highest scaled fitness
## and store index in p
grow[subcoord[p]] <- 1 ## ... and tell it grew
ind [i] <- subind [p] ## Daughter will be derived from same initial individual
swit[i] <- subswit[p] ## and be of same switching genotype
if( subswit[p] == 2 ) { ## if non switcher
matnew[i] <- submat[p] ## both cells will be of the same mating type
## and both are non-switchers
}else if( runif(1) <
(switFreq * switPM[submat[p]] )) { ## check if switcher switches which can be
## different for P and M cells
if(runif(1) >= 0.5){ ## parent moves to new cell
matnew[i] <- submat[ p ]
mat[subcoord[ p ]] <- 1 + submat[ p ] %% 2 ## changes 1->2 or 2->1
} else { ## daughter moves to new cell
matnew[i] <- 1 + submat[ p ] %% 2 ## changes 1->2 or 2->1
}
} else { ## Switcher does not switch
matnew[i] <- submat[p] ## both cells will be of the same mating type
}
} ## end if 'any neighbor can still grow'
}
}
mat <- mat + matnew ## update the mat matrix
occup <- mat > 0 ## tell if is occupied
## every 'draw' growth cycles draw figure
if( j %% draw == 0 & draw > 0){
# the next commented-out lines can be used to save
# each 'draw' timepoints to a file for e.g. animated gif
# creating a name for each plot file with leading zeros
#if (j < 10) {name = paste0('000',j,'plot.png')}
#if (j < 100 && j >= 10) {name = paste0('00',j,'plot.png')}
# png(paste("loverviewmated_",j,".png"), width=600,height=600,units="px")
image( ( mat > 0 )*( (swit == 2) *2 + mat ) ,
col = c( "#000000","#FFB000","#FF0000","#0000FF","#009DFF") , axes = FALSE)
#dev.off()
}
if( sum( occup ) == SIZE^2){ ## stop if all locations are filled and produce final figures
if( drawfinal ){
# image(mat, main=paste0("Mating type t=",j), col=c(rainbow(4)))
image(mat + 2 * (swit == 2 ) )
#image(swit, main=paste0("Switching? t=",j), col=c(rainbow(5)))
}
return(list(mat=mat, swit=swit, ind = ind ))
}
}
return( list( mat = mat, swit = swit, ind = ind ) )
}
#### mating ####
## Input a square matrix of integers (0, 1, 2) and match individuals of opposite mating type with each other.
## When local mating is on, the function finds a mate in the direct environment, based on the coordinates
## obtained from the function subcoordinates. Then a random cell is found that is of opposite mating type
## that also did not mate. The two cells are mated by uopdating their status in the matrix 'mated'. When all
## cells have been samples, the 'mated' matrix is returned by the function.
##
## If local mating is FALSE, all cells of the mating type in the minority are mated, and the other mating type
## randomly a subsample is mated in equal amount to the minority mating type.
##
runMate <- function( mat , ind = NULL, localMat = TRUE , torus = FALSE, allow_selfing = T){
## get size of matrix
SIZE <- nrow( mat )
## create an empty matrix to store information on mating
mated <- matrix( FALSE, nrow = SIZE, ncol = SIZE)
if(is.null( ind ) ){ # No tracing of clonal identity to measure outcrossing
if( localMat ){ # mating only takes place with nearest neighbors
for(i in sample( SIZE^2 ) ){ ## loop over randomized order of all cells
if(!mated[ i ] & mat[ i ] != 0){ ## if the focal cell is already mated skip and go to next cell
## get info on the local individuals into two newly creased vectors (submated and submat)
submated <- rep( F , 8 )
submat <- rep( 0 , 8 )
subcoord <- subcoordinates ( i , SIZE, torus ) ## get coordinates of neighbours using function subcoordinates
submat[ subcoord > 0 ] <- mat [ subcoord ] ## get neighbour's mating type
submated[ subcoord > 0 ] <- mated[ subcoord ] ## and see if they mated already
m <- mat[ i ] ## get mating type focal
## find a mate and mate!
if( sum( submat > 0 & submat != m & !submated ) ){ ## continue if there are unmated individuals of opposite mating type
p <- resample( subcoord[ submat > 0 & submat != m & !submated ] ) ## pick only one
mated[ p ] <- mated[ i ] <- T ## set both mated cells to mated.
}
}
}
}
else { ## with random mating, the mating type in the minority all mate, the other mat are randomly mated
if( sum( mat==1 ) > sum(mat==2) ){
mated[ mat == 1 ][sample( sum( mat==1 ), sum(mat==2) )] <- TRUE
mated[ mat == 2 ] <- TRUE
}
else{
mated[ mat == 2 ][sample( sum( mat==2 ), sum(mat==1) )] <- TRUE
mated[ mat == 1 ] <- TRUE
}
}
return( mated )
} else { # document inter and intraclonal mating
if( localMat ){ # mating only takes place with nearest neighbors
if( allow_selfing ){
for(i in sample( SIZE^2 ) ){ ## loop over randomized order of all cells
if(!mated[ i ] & mat[ i ] != 0){ ## if the focal cell is already mated skip and go to next cell
## get info on the local individuals into two newly creased vectors (submated and submat)
submated <- rep( F , 8 )
submat <- subind <- rep( 0 , 8 )
subcoord <- subcoordinates ( i , SIZE, torus ) ## get coordinates of neighbours using function subcoordinates
submat[ subcoord > 0 ] <- mat [ subcoord ] ## get neighbour's mating type
submated[ subcoord > 0 ] <- mated[ subcoord ] ## and see if they mated already
m <- mat[ i ] ## get mating type focal
## find a mate and mate!
if( sum( submat > 0 & submat != m & !submated ) ){
## continue if there are unmated individuals of opposite mating type
p <- resample( subcoord[ submat > 0 & submat != m & !submated ] )[1]
## pick only one
mated[ p ] <- mated[ i ] <- ifelse( ind[p] == ind[i], 1, 2)
## set both mated cells to
## * 1 for selfed or to
## * 2 for outcrossed
}
}
}
} else { # only outcrossing allowed
for(i in sample( SIZE^2 ) ){ ## loop over randomized order of all cells
if(!mated[ i ] & mat[ i ] != 0){ ## if the focal cell is already mated skip and go to next cell
## get info on the local individuals into two newly creased vectors (submated and submat)
submated <- rep( F , 8 )
submat <- subind <- rep( 0 , 8 )
subcoord <- subcoordinates ( i , SIZE, torus ) ## get coordinates of neighbours using function subcoordinates
submat[ subcoord > 0 ] <- mat [ subcoord ] ## get neighbour's mating type
submated[ subcoord > 0 ] <- mated[ subcoord ] ## and see if they mated already
subind[ subcoord > 0 ] <- ind[ subcoord ] ## document individual identity
m <- mat[ i ] ## get mating type focal
## find a mate and mate!
if( sum( submat > 0 & # check if cell present
submat != m & # and if cell is of opposite mating type
subind != ind[ i ] & # and if cell is not same clone
!submated ) ){ # and is not mated yet
## continue if there are unmated individuals of opposite mating type
p <- resample( subcoord[ submat > 0 &
submat != m &
subind != ind[ i ] &
!submated ] )[1]
## pick only one
mated[ p ] <- mated[ i ] <- ifelse( ind[p] == ind[i], 1, 2)
## set both mated cells to
## * 1 for selfed or to
## * 2 for outcrossed
}
}
}
}
}
else { ## with random mating, the mating type in the minority all mate, the other mat are randomly mated
if( sum( mat==1 ) > sum(mat==2) ){
mated[ mat == 1 ][sample( sum( mat==1 ), sum(mat==2) )] <- TRUE
mated[ mat == 2 ] <- TRUE
}
else{
mated[ mat == 2 ][sample( sum( mat==2 ), sum(mat==1) )] <- TRUE
mated[ mat == 1 ] <- TRUE
}
}
return( mated )
}
}
#### subcoordinates ####
## Function requires the width of a square matrix and the location of the focal individual.
## Function returns a vector of size 8 containing the coordinates of the 8 squares directly surrounding the focal.
## If torus is set to TRUE, the function returns the coordinates as if the matrix is
## folded around with top connecting to bottom and left to right.
## Else, the function will remove all values that point to locations outside the matrix and set them to 0.
subcoordinates <- function(i, size , torus = FALSE){
subref <- c((-1:1)-size,-1,1,(-1:1)+size) ## make a vector of 8 values
## that indicates how far to
## transpose for the neighbours.
## This will be used to find the neighbours
subcoord <- subref + i ## get the coordinates of neighbors
if(torus){
if( i %% size == 0 ){subcoord[ subcoord %% size == 1 ] <-
subcoord[ subcoord %% size == 1 ] - size } ## fold top to bottom
if( i %% size == 1 ){subcoord[ subcoord %% size == 0 ] <-
subcoord[ subcoord %% size == 0 ] + size } ## fold bottom to top
subcoord[ subcoord < 1 ] <- size ^ 2 + subcoord[ subcoord < 1 ] ## fold left to right
## (redirect negative values to the last column of the matrix)
subcoord[ subcoord > size^2 ] <- subcoord[ subcoord > size^2 ] - size^2 ## fold right to left
## (redirect larger everything than matrix to first column)
}
else{
subcoord[ subcoord < 1 ] <- 0 ## remove negative neighbors
subcoord[ subcoord > size^2 ] <- 0 ## remove overly positive neighbors
if( i %% size == 0 ){subcoord[subcoord%%size==1]<-0} ## remove bottom row when focal
## on bottom edge
if( i %% size == 1 ){subcoord[subcoord%%size==0]<-0} ## remove top row when focal on
## top edge
}
return(subcoord)
}
#### torufy ####
## Fold a square matrix and generate an extended matrix which implements a
## torus shape. Effectively, the takes function a matrix and adds one or more
## rows on all sides of the matrix, using the top to add to the bottom, the
## bottom to the top, the left to the right and right to the left. The
## total size of the new matrix that is returned will be size of before
## matrix + 2x size of extend.
torufy <- function( squarematrix, extend = 1 ){
size <- nrow( squarematrix ) ## get size of the before matrix
squarematrix <- rbind( squarematrix[(size - extend + 1) : size , ], squarematrix, squarematrix[ 1 : extend , ]) ## add rows before and after
squarematrix <- cbind( squarematrix[ , (size - extend + 1) : size ], squarematrix, squarematrix[ , 1 : extend ]) ## add columns before and after
return( squarematrix )
}