Load libraries

library(tidyverse)

Import data

We are importing the metadata which can be found here:

/data/shared/Rfiles/metadata_sub.csv

We are also importing the two files classification_counts.csv and hostread_counts.csv that you created in yesterdays session. If you haven’t created those files, then we provided these files for you here:

/data/shared/Rfiles/classification_counts.csv

/data/shared/Rfiles/hostread_counts.csv

Make sure you copy this file to your current working directory or specify the correct path to the file when importing.

Import the metadata and check the input file using the head() command

meta<-read_delim(file="metadata_sub.csv", delim="\t")
head(meta)
## # A tibble: 6 x 5
##   Sample   group MouseID  timepoint mocktreat
##   <chr>    <chr> <chr>    <chr>     <chr>    
## 1 Sample3  Yc    5        Pre-Abx   A        
## 2 Sample4  Yc    8        Pre-Abx   B        
## 3 Sample6  Pools old pool Pre-Abx   A        
## 4 Sample13 Yc    3        Pre-Abx   B        
## 5 Sample22 Yp    4        Pre-Abx   A        
## 6 Sample25 Yc    4        Pre-Abx   B

Import the classification counts and host read counts as follows

classified<-read_delim(file="classification_counts.csv", delim=",")
head(classified)
## # A tibble: 2 x 11
##   taxlevel taxid taxa  Sample6.report Sample4.report Sample31.report
##   <chr>    <dbl> <chr>          <dbl>          <dbl>           <dbl>
## 1 U            0 uncl…         630819         743725          735948
## 2 R            1 root          197122         162069          165930
## # … with 5 more variables: Sample30.report <dbl>, Sample3.report <dbl>,
## #   Sample25.report <dbl>, Sample22.report <dbl>, Sample13.report <dbl>
hostreads<-read_delim(file="hostread_counts.csv", delim=",")
head(hostreads)
## # A tibble: 2 x 11
##   taxlevel taxid taxa  Sample6.report Sample4.report Sample31.report
##   <chr>    <dbl> <chr>          <dbl>          <dbl>           <dbl>
## 1 U            0 uncl…         920125         982321          983343
## 2 R            1 root           79875          17679           16657
## # … with 5 more variables: Sample30.report <dbl>, Sample3.report <dbl>,
## #   Sample25.report <dbl>, Sample22.report <dbl>, Sample13.report <dbl>

We see in the files we still have .report in the sample names. In order to be able to merge this data with the metadata we need to remove this string and just keep the sample names.

colnames(classified)<-gsub(".report", "", colnames(classified))
colnames(hostreads)<-gsub(".report", "", colnames(hostreads))

head(classified)
## # A tibble: 2 x 11
##   taxlevel taxid taxa  Sample6 Sample4 Sample31 Sample30 Sample3 Sample25
##   <chr>    <dbl> <chr>   <dbl>   <dbl>    <dbl>    <dbl>   <dbl>    <dbl>
## 1 U            0 uncl…  630819  743725   735948   616746  702254   731879
## 2 R            1 root   197122  162069   165930   263061  213543   181963
## # … with 2 more variables: Sample22 <dbl>, Sample13 <dbl>
head(hostreads)
## # A tibble: 2 x 11
##   taxlevel taxid taxa  Sample6 Sample4 Sample31 Sample30 Sample3 Sample25
##   <chr>    <dbl> <chr>   <dbl>   <dbl>    <dbl>    <dbl>   <dbl>    <dbl>
## 1 U            0 uncl…  920125  982321   983343   944690  989295   972564
## 2 R            1 root    79875   17679    16657    55310   10705    27436
## # … with 2 more variables: Sample22 <dbl>, Sample13 <dbl>

Preprocess data

We want to merge metadata and read counts. To do that we first need to re-arrange our data to make sure that we have the samples as column and not as column names.

classified<- classified %>% 
  select(-c(taxid,taxlevel)) %>% # this removes 2 columns we don't need
  gather(Sample, value,-taxa) %>%  # this gathers the data into columns except for column taxa
  spread(taxa, value) %>% # this spreads out again the data into the required format
  rename("classified"=root) # here we rename the column root into 'classified'
head(classified)
## # A tibble: 6 x 3
##   Sample   classified unclassified
##   <chr>         <dbl>        <dbl>
## 1 Sample13     171695       766525
## 2 Sample22     176953       501215
## 3 Sample25     181963       731879
## 4 Sample3      213543       702254
## 5 Sample30     263061       616746
## 6 Sample31     165930       735948
hostreads<- hostreads %>% 
  select(-c(taxid,taxlevel)) %>% 
  gather(Sample, value,-taxa) %>% 
  spread(taxa, value) %>% 
  rename("host"=root) %>%
  rename("nohost"=unclassified) 
head(hostreads)
## # A tibble: 6 x 3
##   Sample     host nohost
##   <chr>     <dbl>  <dbl>
## 1 Sample13  10160 989840
## 2 Sample22 241620 758380
## 3 Sample25  27436 972564
## 4 Sample3   10705 989295
## 5 Sample30  55310 944690
## 6 Sample31  16657 983343

Join with metadata

Now we are ready to join with the metadata by their common Sample column. We use full_join to do that. Check out the resulting data.

metainfo<-meta %>%
  full_join(classified,by="Sample") %>%
  full_join(hostreads,by="Sample")

head(metainfo)
## # A tibble: 6 x 9
##   Sample group MouseID timepoint mocktreat classified unclassified   host nohost
##   <chr>  <chr> <chr>   <chr>     <chr>          <dbl>        <dbl>  <dbl>  <dbl>
## 1 Sampl… Yc    5       Pre-Abx   A             213543       702254  10705 989295
## 2 Sampl… Yc    8       Pre-Abx   B             162069       743725  17679 982321
## 3 Sampl… Pools old po… Pre-Abx   A             197122       630819  79875 920125
## 4 Sampl… Yc    3       Pre-Abx   B             171695       766525  10160 989840
## 5 Sampl… Yp    4       Pre-Abx   A             176953       501215 241620 758380
## 6 Sampl… Yc    4       Pre-Abx   B             181963       731879  27436 972564

Let’s plot the data

First we check how many host reads we had in the beginning

# gather the columns to make them suitable with ggplot
metainfo_gg1 <- metainfo %>% 
  gather(key="host",value="counts",-c(Sample, group, MouseID, timepoint, mocktreat, classified, unclassified))
head(metainfo_gg1)
## # A tibble: 6 x 9
##   Sample  group MouseID timepoint mocktreat classified unclassified host  counts
##   <chr>   <chr> <chr>   <chr>     <chr>          <dbl>        <dbl> <chr>  <dbl>
## 1 Sample3 Yc    5       Pre-Abx   A             213543       702254 host   10705
## 2 Sample4 Yc    8       Pre-Abx   B             162069       743725 host   17679
## 3 Sample6 Pools old po… Pre-Abx   A             197122       630819 host   79875
## 4 Sample… Yc    3       Pre-Abx   B             171695       766525 host   10160
## 5 Sample… Yp    4       Pre-Abx   A             176953       501215 host  241620
## 6 Sample… Yc    4       Pre-Abx   B             181963       731879 host   27436

Now create a stacked bar plot

ggplot(metainfo_gg1, aes(x=Sample, y=counts)) +
  geom_bar(stat="identity",position="stack", aes(fill=host)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  facet_grid(. ~ mocktreat, scales="free_x",space = "free_x")

Interesting, Sample22 seems to have had much more host than the others.

Now let’s take a look how many reads of the decontaminated data were actually classified by your kraken2 run.

metainfo_gg2 <- metainfo %>% 
  gather(key="classified",value="counts",-c(Sample, group, MouseID, timepoint, mocktreat, host, nohost))
head(metainfo_gg2)
## # A tibble: 6 x 9
##   Sample   group MouseID  timepoint mocktreat   host nohost classified counts
##   <chr>    <chr> <chr>    <chr>     <chr>      <dbl>  <dbl> <chr>       <dbl>
## 1 Sample3  Yc    5        Pre-Abx   A          10705 989295 classified 213543
## 2 Sample4  Yc    8        Pre-Abx   B          17679 982321 classified 162069
## 3 Sample6  Pools old pool Pre-Abx   A          79875 920125 classified 197122
## 4 Sample13 Yc    3        Pre-Abx   B          10160 989840 classified 171695
## 5 Sample22 Yp    4        Pre-Abx   A         241620 758380 classified 176953
## 6 Sample25 Yc    4        Pre-Abx   B          27436 972564 classified 181963
ggplot(metainfo_gg2, aes(x=Sample, y=counts)) +
  geom_bar(stat="identity",position="stack", aes(fill=classified)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  facet_grid(. ~ mocktreat, scales="free_x",space = "free_x")

Let’s now plot host reads, classified fraction and unclassified fraction into the same plot.

metainfo_gg3 <- metainfo %>% 
  gather(key="reads",value="counts",-c(Sample, group, MouseID, timepoint, mocktreat, nohost))
head(metainfo_gg3)
## # A tibble: 6 x 8
##   Sample   group MouseID  timepoint mocktreat nohost reads      counts
##   <chr>    <chr> <chr>    <chr>     <chr>      <dbl> <chr>       <dbl>
## 1 Sample3  Yc    5        Pre-Abx   A         989295 classified 213543
## 2 Sample4  Yc    8        Pre-Abx   B         982321 classified 162069
## 3 Sample6  Pools old pool Pre-Abx   A         920125 classified 197122
## 4 Sample13 Yc    3        Pre-Abx   B         989840 classified 171695
## 5 Sample22 Yp    4        Pre-Abx   A         758380 classified 176953
## 6 Sample25 Yc    4        Pre-Abx   B         972564 classified 181963
ggplot(metainfo_gg3, aes(x=Sample, y=counts)) +
  geom_bar(stat="identity",position="stack", aes(fill=reads)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  facet_grid(. ~ mocktreat, scales="free_x",space = "free_x")

In general we can see that the number of reads classified by kraken2 were pretty similar across the samples.