Diff of /scripts/BCR_diversity.Rmd [000000] .. [fb5156]

Switch to unified view

a b/scripts/BCR_diversity.Rmd
1
---
2
title: "BCR analysis"
3
author: "Ming Tang"
4
date: '2023-07-26'
5
output: html_document
6
---
7
8
Relate to Main Figure 4f
9
10
### focus on BCR first 
11
12
we have CD3- and CD3+ subsets. 64 samples each.
13
14
for the CD3+ subset, there are B cells in the samples as well (contamination).
15
for the CD3- subset, there are T cells in the samples as well.
16
17
Let's focus on the CD3- subset first as most of B cells should come from there.
18
```{bash}
19
20
cd /liulab/mtang/projects/hodgkin_lymphoma_cellranger_output/TRUST4_out
21
22
wc -l  ../cd3*
23
  64 ../cd3-_pool_ids.txt
24
  64 ../cd3+_pool_ids.txt
25
 128 total
26
 
27
head ../cd3*
28
==> ../cd3-_pool_ids.txt <==
29
Pool84_2
30
Pool84_4
31
Pool84_6
32
Pool84_8
33
Pool84_18
34
Pool84_20
35
Pool84_22
36
Pool84_24
37
Pool84_12
38
Pool84_10
39
40
==> ../cd3+_pool_ids.txt <==
41
Pool84_1
42
Pool84_3
43
Pool84_5
44
Pool84_7
45
Pool84_17
46
Pool84_19
47
Pool84_21
48
Pool84_23
49
Pool84_11
50
Pool84_9
51
52
```
53
54
55
```{bash}
56
mkdir cd3_minus
57
mkdir cd3_plus
58
59
# move them into seperate folders
60
cat ../cd3+_pool_ids.txt | parallel 'mv {}_TRUST4 cd3_plus'
61
cat ../cd3-_pool_ids.txt | parallel 'mv {}_TRUST4 cd3_minus'
62
```
63
64
I rerun TRUST4 v1.0.1 changed the output format.
65
I was using v0.2. TRUST4 v1.0 added `contig_id`  column etc.
66
67
 perl ../trust-barcoderep-to-10X.pl
68
Usage: ./trust-barcoderep-to-10X.pl trust_barcode_report.tsv 10X_report_prefix
69
70
```{bash}
71
cd /liulab/mtang/projects/hodgkin_lymphoma_cellranger_output/TRUST4_out_v1.0.1/cd3_minus
72
73
```
74
75
`convert2_10x.sh`
76
```{bash}
77
#! /bin/bash
78
79
set -euo pipefail
80
81
prefix=$(echo $1 | cut -d/ -f2)
82
perl ../trust-barcoderep-to-10X.pl $1 $prefix
83
```
84
85
86
```{bash}
87
find . -name "*TRUST4_barcode_report.tsv" | parallel './convert2_10x.sh {}'
88
```
89
90
immunarch is parsing the filename as metadata, **do not use too long filenames with many _**.
91
BCR and TCR result will be in separate files. do not use file path name containing **barcode**.
92
93
94
```{bash}
95
cp *_b.csv /liulab/mtang/projects/hodgkin_lymphoma_scRNAseq/data/cd3_minus_TRUST4_v1.0.1_BCR
96
```
97
98
 python trust-cluster.py Pool84_10_TRUST4/Pool84_10_TRUST4_cdr3.out
99
100
101
#### prefix the tigl_id to the cellbarcode to match the cell barcode in the Seurat object
102
103
```{r}
104
#cd3_minus_annotated@meta.data %>% 
105
#  tibble::rownames_to_column(var="barcode") %>%
106
#  write_tsv(here("data/cd3_minus_annotated_seurat_meta.tsv"))
107
108
109
## this metadata is at cell level
110
cd3_minus_annotated_meta<- read_tsv(here("data/cd3_minus_annotated_seurat_meta.tsv"),
111
                                    col_types = cols(.default = col_character()))
112
113
head(cd3_minus_annotated_meta$barcode)
114
## this metadata is at sample level
115
samples_meta<- cd3_minus_annotated_meta %>%
116
  select(tigl_id:cohort2, group) %>%
117
  distinct(.keep_all = TRUE)
118
119
head(samples_meta)
120
121
TRUST4_outputs<- list.files(here("data/cd3_minus_TRUST4_v1.0.1_BCR"), full.names = TRUE)
122
123
pool_id<- str_replace(TRUST4_outputs, ".+(Pool.+)_TRUST4_b.csv", "\\1")
124
tigl_id<- left_join(tibble::tibble(pool_id = pool_id), samples_meta) %>% pull(tigl_id)
125
names(TRUST4_outputs)<- tigl_id
126
127
TRUST4_outs<- map(TRUST4_outputs, read_csv)
128
129
rename_barcode<- function(df, tigl_id){
130
  df<- df %>%
131
    mutate(barcode = paste(tigl_id, barcode, sep="-")) %>%
132
    mutate(barcode = str_replace(barcode, "-1$", ""))
133
  return(df)
134
}
135
136
TRUST4_outs<- purrr::map2(TRUST4_outs, names(TRUST4_outs), ~rename_barcode(df = .x, tigl_id = .y))
137
138
dir.create(here("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed"))
139
walk2(TRUST4_outs, pool_id, 
140
     ~ write_csv(.x, here(paste0("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed/", .y, "_TRUST4_b.csv"))))
141
```
142
 
143
 
144
145
```{r}
146
library(tidyverse)
147
library(here)
148
cd3_minus_annotated_meta<- read_tsv(here("data/cd3_minus_annotated_seurat_meta.tsv"),
149
                                    col_types = cols(.default = col_character()))
150
head(cd3_minus_annotated_meta$barcode)
151
## this metadata is at sample level
152
samples_meta<- cd3_minus_annotated_meta %>%
153
  select(tigl_id:cohort2, group) %>%
154
  distinct(.keep_all = TRUE)
155
```
156
157
immunarch is very picky about the name of the files. one should not have `barcode` in the path! (took me a long time to figure it out...)
158
159
```{r}
160
library(immunarch)
161
library(here)
162
library(tidyverse)
163
library(PMCMRplus)
164
165
###### do not use immunarch for importing, we want to filter the cells based on scRNAseq data ####
166
# we can try both paired heavy chain and light chain
167
# or only heavy chain using .mode = "single"
168
#cd3_minus_BCR <- repLoad(here("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed"), .mode = "paired")
169
#cd3_minus_BCR <- repLoad(here("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed"), .mode = "single")
170
## subset only the heavy chain
171
#IGH_indx<- str_detect(cd3_minus_BCR$meta$Chain, "IGH")
172
#cd3_minus_BCR$data<- cd3_minus_BCR$data[IGH_indx]
173
174
#cd3_minus_BCR$meta<- cd3_minus_BCR$meta[IGH_indx, ]
175
#map(cd3_minus_BCR$data, ~ filter(.x, str_detect(V.name, "IGKV1-39")))
176
#dplyr::filter(cd3_minus_BCR$data$Pool84_6_TRUST4_b, str_detect(V.name, "IGKV1-39") ) %>% View()
177
#View(cd3_minus_BCR$data$Pool87_8_TRUST4_b)
178
#map(cd3_minus_BCR$data, ~ filter(.x, str_detect(V.name, "IGKV1D-39")))
179
180
181
scBCR_files<- list.files(here("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed"), full.names = TRUE)
182
read_scBCR<- function(file){
183
  scBCR<- read_csv(file)
184
  scBCR<- scBCR %>%
185
    filter(cdr3 != "None")
186
  scBCR$pool_id<- str_replace(basename(file), "_TRUST4_b.csv", "")
187
  scBCR<- left_join(scBCR, samples_meta, by = c("pool_id" = "pool_id"))
188
  return(scBCR)
189
}
190
scBCR_data<- map(scBCR_files, read_scBCR)
191
scBCR_data<- bind_rows(scBCR_data) 
192
193
write_csv(scBCR_data, here("results/figures/scBCR_TRUST4_out_no_filtering_heavy_and_light_chain.csv"))
194
195
## also, only filter the heavy chain and productive clones
196
scBCR_heavy<- scBCR_data %>%
197
  filter(chain =="IGH") %>% 
198
  filter(productive) %>%
199
  distinct(barcode, v_gene, j_gene, cdr3, .keep_all = TRUE)  
200
201
202
```
203
204
205
### remove all the cells without BCR info and retain only cells in the B cell space.
206
207
```{r}
208
cd3_minus_meta<- read_tsv(here('data/cd3_minus_metadata.tsv'), guess_max = 1000000)
209
210
scBCR_heavy<-scBCR_heavy %>%
211
  inner_join(cd3_minus_meta %>% select(cell, clusters_anno_sub), by=c("barcode" = "cell"))
212
213
scBCR_heavy<- scBCR_heavy %>%
214
  mutate(cloneType_heavy = paste0("clonetype_heavy_", group_indices_(., .dots = c('v_gene', 'j_gene', 'cdr3'))))
215
216
scBCR_heavy$group<- factor(scBCR_heavy$group, levels = c("Healthy_donors",
217
                                                                       "New_C1D1",
218
                                                                       "refractory_CR_C1D1",
219
                                                                       "refractory_PR_C1D1",
220
                                                                       "refractory_PD_C1D1",
221
                                                                       "refractory_CR_C4D1",
222
                                                                       "refractory_PR_C4D1",
223
                                                                       "refractory_PD_C4D1"))
224
225
write_csv(scBCR_heavy, here("results/figures/scBCR_productive_heavy_chain_B_cells_only.csv"))
226
```
227
228
229
### diversity
230
231
```{r}
232
library(tcR)
233
## count unique clonotype
234
count_unique_clonotype<- function(df){
235
  df<- df %>%
236
    count(pool_id)
237
  return(df)
238
}
239
## chao1 diversity
240
calculate_diversity<- function(df){
241
  df<- df %>%
242
    group_by(pool_id) %>%
243
    summarize(diversity = chao1(Read.count)[1])
244
  return(df)
245
}
246
247
248
scBCR_heavy_nest<- scBCR_heavy %>% 
249
  group_by(group) %>% 
250
  count(pool_id, cloneType_heavy, name = "Read.count") %>%
251
  nest(-group)
252
253
254
scBCR_unique_clonotype<- scBCR_heavy_nest %>%
255
  mutate(diversity = map(data, count_unique_clonotype)) %>%
256
  select(-data) %>%
257
  unnest(diversity)
258
259
260
scBCR_diversity<- scBCR_heavy_nest %>%
261
  mutate(diversity = map(data, calculate_diversity)) %>%
262
  select(-data) %>%
263
  unnest(diversity)
264
265
```
266
267
268
### plotting
269
270
271
```{r}
272
scBCR_unique_clonotype %>%
273
  ggplot(aes(x=group, y = n)) + 
274
  geom_boxplot(aes(fill = group), outlier.shape = NA) +
275
  geom_point() +
276
  theme_bw(base_size = 14) + 
277
  theme(axis.text.x = element_blank(),
278
        axis.ticks.x = element_blank(),
279
        strip.text = element_text(size = 8)) +
280
  xlab("") +
281
  ylab("unique clonotype") +
282
  ggtitle("BCR heavy unique clonotype")
283
ggsave(here("results/figures/BCR_unique_clonotype_filtered_BCR_heavy_per_sample.pdf"), width =8, height = 6)
284
285
write_csv(scBCR_unique_clonotype,here("results/figures/BCR_unique_clonotype_filtered_BCR_heavy_per_sample.csv"))
286
287
scBCR_diversity %>%
288
  ggplot(aes(x=group, y = diversity)) + 
289
  geom_boxplot(aes(fill = group), outlier.shape = NA) +
290
  geom_point() +
291
  theme_bw(base_size = 14) + 
292
  theme(axis.text.x = element_blank(),
293
        axis.ticks.x = element_blank(),
294
        strip.text = element_text(size = 8)) +
295
  xlab("") +
296
  ylab("chao1 index") +
297
  ggtitle("chao1 BCR heavy diversity")
298
ggsave(here("results/figures/BCR_chao1_filtered_BCR_heavy_per_sample.pdf"), width =8, height = 6)
299
300
write_csv(scBCR_diversity, here("results/figures/BCR_chao1_filtered_BCR_heavy_per_sample.csv"))
301
302
```
303
304
305
```{r}
306
cells_per_sample<- scBCR_heavy %>% 
307
  count(group, pool_id, name = "total_number_cells")
308
309
310
cells_per_sample %>%
311
  ggplot(aes(x=group, y = total_number_cells)) + 
312
  geom_boxplot(aes(fill = group), outlier.shape = NA) +
313
  geom_point() +
314
  theme_bw(base_size = 14) + 
315
  theme(axis.text.x = element_blank(),
316
        axis.ticks.x = element_blank(),
317
        strip.text = element_text(size = 8)) +
318
  xlab("") +
319
  ylab("number of cells") +
320
  ggtitle("total number of cells")
321
ggsave(here("results/figures/BCR_total_number_cells_BCR_heavy_per_sample.pdf"), width =8, height = 6)
322
323
write_csv(cells_per_sample, here("results/figures/BCR_total_number_cells_BCR_heavy_per_sample.csv"))
324
```
325
326