|
a |
|
b/scripts/BCR_diversity.Rmd |
|
|
1 |
--- |
|
|
2 |
title: "BCR analysis" |
|
|
3 |
author: "Ming Tang" |
|
|
4 |
date: '2023-07-26' |
|
|
5 |
output: html_document |
|
|
6 |
--- |
|
|
7 |
|
|
|
8 |
Relate to Main Figure 4f |
|
|
9 |
|
|
|
10 |
### focus on BCR first |
|
|
11 |
|
|
|
12 |
we have CD3- and CD3+ subsets. 64 samples each. |
|
|
13 |
|
|
|
14 |
for the CD3+ subset, there are B cells in the samples as well (contamination). |
|
|
15 |
for the CD3- subset, there are T cells in the samples as well. |
|
|
16 |
|
|
|
17 |
Let's focus on the CD3- subset first as most of B cells should come from there. |
|
|
18 |
```{bash} |
|
|
19 |
|
|
|
20 |
cd /liulab/mtang/projects/hodgkin_lymphoma_cellranger_output/TRUST4_out |
|
|
21 |
|
|
|
22 |
wc -l ../cd3* |
|
|
23 |
64 ../cd3-_pool_ids.txt |
|
|
24 |
64 ../cd3+_pool_ids.txt |
|
|
25 |
128 total |
|
|
26 |
|
|
|
27 |
head ../cd3* |
|
|
28 |
==> ../cd3-_pool_ids.txt <== |
|
|
29 |
Pool84_2 |
|
|
30 |
Pool84_4 |
|
|
31 |
Pool84_6 |
|
|
32 |
Pool84_8 |
|
|
33 |
Pool84_18 |
|
|
34 |
Pool84_20 |
|
|
35 |
Pool84_22 |
|
|
36 |
Pool84_24 |
|
|
37 |
Pool84_12 |
|
|
38 |
Pool84_10 |
|
|
39 |
|
|
|
40 |
==> ../cd3+_pool_ids.txt <== |
|
|
41 |
Pool84_1 |
|
|
42 |
Pool84_3 |
|
|
43 |
Pool84_5 |
|
|
44 |
Pool84_7 |
|
|
45 |
Pool84_17 |
|
|
46 |
Pool84_19 |
|
|
47 |
Pool84_21 |
|
|
48 |
Pool84_23 |
|
|
49 |
Pool84_11 |
|
|
50 |
Pool84_9 |
|
|
51 |
|
|
|
52 |
``` |
|
|
53 |
|
|
|
54 |
|
|
|
55 |
```{bash} |
|
|
56 |
mkdir cd3_minus |
|
|
57 |
mkdir cd3_plus |
|
|
58 |
|
|
|
59 |
# move them into seperate folders |
|
|
60 |
cat ../cd3+_pool_ids.txt | parallel 'mv {}_TRUST4 cd3_plus' |
|
|
61 |
cat ../cd3-_pool_ids.txt | parallel 'mv {}_TRUST4 cd3_minus' |
|
|
62 |
``` |
|
|
63 |
|
|
|
64 |
I rerun TRUST4 v1.0.1 changed the output format. |
|
|
65 |
I was using v0.2. TRUST4 v1.0 added `contig_id` column etc. |
|
|
66 |
|
|
|
67 |
perl ../trust-barcoderep-to-10X.pl |
|
|
68 |
Usage: ./trust-barcoderep-to-10X.pl trust_barcode_report.tsv 10X_report_prefix |
|
|
69 |
|
|
|
70 |
```{bash} |
|
|
71 |
cd /liulab/mtang/projects/hodgkin_lymphoma_cellranger_output/TRUST4_out_v1.0.1/cd3_minus |
|
|
72 |
|
|
|
73 |
``` |
|
|
74 |
|
|
|
75 |
`convert2_10x.sh` |
|
|
76 |
```{bash} |
|
|
77 |
#! /bin/bash |
|
|
78 |
|
|
|
79 |
set -euo pipefail |
|
|
80 |
|
|
|
81 |
prefix=$(echo $1 | cut -d/ -f2) |
|
|
82 |
perl ../trust-barcoderep-to-10X.pl $1 $prefix |
|
|
83 |
``` |
|
|
84 |
|
|
|
85 |
|
|
|
86 |
```{bash} |
|
|
87 |
find . -name "*TRUST4_barcode_report.tsv" | parallel './convert2_10x.sh {}' |
|
|
88 |
``` |
|
|
89 |
|
|
|
90 |
immunarch is parsing the filename as metadata, **do not use too long filenames with many _**. |
|
|
91 |
BCR and TCR result will be in separate files. do not use file path name containing **barcode**. |
|
|
92 |
|
|
|
93 |
|
|
|
94 |
```{bash} |
|
|
95 |
cp *_b.csv /liulab/mtang/projects/hodgkin_lymphoma_scRNAseq/data/cd3_minus_TRUST4_v1.0.1_BCR |
|
|
96 |
``` |
|
|
97 |
|
|
|
98 |
python trust-cluster.py Pool84_10_TRUST4/Pool84_10_TRUST4_cdr3.out |
|
|
99 |
|
|
|
100 |
|
|
|
101 |
#### prefix the tigl_id to the cellbarcode to match the cell barcode in the Seurat object |
|
|
102 |
|
|
|
103 |
```{r} |
|
|
104 |
#cd3_minus_annotated@meta.data %>% |
|
|
105 |
# tibble::rownames_to_column(var="barcode") %>% |
|
|
106 |
# write_tsv(here("data/cd3_minus_annotated_seurat_meta.tsv")) |
|
|
107 |
|
|
|
108 |
|
|
|
109 |
## this metadata is at cell level |
|
|
110 |
cd3_minus_annotated_meta<- read_tsv(here("data/cd3_minus_annotated_seurat_meta.tsv"), |
|
|
111 |
col_types = cols(.default = col_character())) |
|
|
112 |
|
|
|
113 |
head(cd3_minus_annotated_meta$barcode) |
|
|
114 |
## this metadata is at sample level |
|
|
115 |
samples_meta<- cd3_minus_annotated_meta %>% |
|
|
116 |
select(tigl_id:cohort2, group) %>% |
|
|
117 |
distinct(.keep_all = TRUE) |
|
|
118 |
|
|
|
119 |
head(samples_meta) |
|
|
120 |
|
|
|
121 |
TRUST4_outputs<- list.files(here("data/cd3_minus_TRUST4_v1.0.1_BCR"), full.names = TRUE) |
|
|
122 |
|
|
|
123 |
pool_id<- str_replace(TRUST4_outputs, ".+(Pool.+)_TRUST4_b.csv", "\\1") |
|
|
124 |
tigl_id<- left_join(tibble::tibble(pool_id = pool_id), samples_meta) %>% pull(tigl_id) |
|
|
125 |
names(TRUST4_outputs)<- tigl_id |
|
|
126 |
|
|
|
127 |
TRUST4_outs<- map(TRUST4_outputs, read_csv) |
|
|
128 |
|
|
|
129 |
rename_barcode<- function(df, tigl_id){ |
|
|
130 |
df<- df %>% |
|
|
131 |
mutate(barcode = paste(tigl_id, barcode, sep="-")) %>% |
|
|
132 |
mutate(barcode = str_replace(barcode, "-1$", "")) |
|
|
133 |
return(df) |
|
|
134 |
} |
|
|
135 |
|
|
|
136 |
TRUST4_outs<- purrr::map2(TRUST4_outs, names(TRUST4_outs), ~rename_barcode(df = .x, tigl_id = .y)) |
|
|
137 |
|
|
|
138 |
dir.create(here("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed")) |
|
|
139 |
walk2(TRUST4_outs, pool_id, |
|
|
140 |
~ write_csv(.x, here(paste0("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed/", .y, "_TRUST4_b.csv")))) |
|
|
141 |
``` |
|
|
142 |
|
|
|
143 |
|
|
|
144 |
|
|
|
145 |
```{r} |
|
|
146 |
library(tidyverse) |
|
|
147 |
library(here) |
|
|
148 |
cd3_minus_annotated_meta<- read_tsv(here("data/cd3_minus_annotated_seurat_meta.tsv"), |
|
|
149 |
col_types = cols(.default = col_character())) |
|
|
150 |
head(cd3_minus_annotated_meta$barcode) |
|
|
151 |
## this metadata is at sample level |
|
|
152 |
samples_meta<- cd3_minus_annotated_meta %>% |
|
|
153 |
select(tigl_id:cohort2, group) %>% |
|
|
154 |
distinct(.keep_all = TRUE) |
|
|
155 |
``` |
|
|
156 |
|
|
|
157 |
immunarch is very picky about the name of the files. one should not have `barcode` in the path! (took me a long time to figure it out...) |
|
|
158 |
|
|
|
159 |
```{r} |
|
|
160 |
library(immunarch) |
|
|
161 |
library(here) |
|
|
162 |
library(tidyverse) |
|
|
163 |
library(PMCMRplus) |
|
|
164 |
|
|
|
165 |
###### do not use immunarch for importing, we want to filter the cells based on scRNAseq data #### |
|
|
166 |
# we can try both paired heavy chain and light chain |
|
|
167 |
# or only heavy chain using .mode = "single" |
|
|
168 |
#cd3_minus_BCR <- repLoad(here("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed"), .mode = "paired") |
|
|
169 |
#cd3_minus_BCR <- repLoad(here("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed"), .mode = "single") |
|
|
170 |
## subset only the heavy chain |
|
|
171 |
#IGH_indx<- str_detect(cd3_minus_BCR$meta$Chain, "IGH") |
|
|
172 |
#cd3_minus_BCR$data<- cd3_minus_BCR$data[IGH_indx] |
|
|
173 |
|
|
|
174 |
#cd3_minus_BCR$meta<- cd3_minus_BCR$meta[IGH_indx, ] |
|
|
175 |
#map(cd3_minus_BCR$data, ~ filter(.x, str_detect(V.name, "IGKV1-39"))) |
|
|
176 |
#dplyr::filter(cd3_minus_BCR$data$Pool84_6_TRUST4_b, str_detect(V.name, "IGKV1-39") ) %>% View() |
|
|
177 |
#View(cd3_minus_BCR$data$Pool87_8_TRUST4_b) |
|
|
178 |
#map(cd3_minus_BCR$data, ~ filter(.x, str_detect(V.name, "IGKV1D-39"))) |
|
|
179 |
|
|
|
180 |
|
|
|
181 |
scBCR_files<- list.files(here("data/cd3_minus_TRUST4_v1.0.1_BCR_renamed"), full.names = TRUE) |
|
|
182 |
read_scBCR<- function(file){ |
|
|
183 |
scBCR<- read_csv(file) |
|
|
184 |
scBCR<- scBCR %>% |
|
|
185 |
filter(cdr3 != "None") |
|
|
186 |
scBCR$pool_id<- str_replace(basename(file), "_TRUST4_b.csv", "") |
|
|
187 |
scBCR<- left_join(scBCR, samples_meta, by = c("pool_id" = "pool_id")) |
|
|
188 |
return(scBCR) |
|
|
189 |
} |
|
|
190 |
scBCR_data<- map(scBCR_files, read_scBCR) |
|
|
191 |
scBCR_data<- bind_rows(scBCR_data) |
|
|
192 |
|
|
|
193 |
write_csv(scBCR_data, here("results/figures/scBCR_TRUST4_out_no_filtering_heavy_and_light_chain.csv")) |
|
|
194 |
|
|
|
195 |
## also, only filter the heavy chain and productive clones |
|
|
196 |
scBCR_heavy<- scBCR_data %>% |
|
|
197 |
filter(chain =="IGH") %>% |
|
|
198 |
filter(productive) %>% |
|
|
199 |
distinct(barcode, v_gene, j_gene, cdr3, .keep_all = TRUE) |
|
|
200 |
|
|
|
201 |
|
|
|
202 |
``` |
|
|
203 |
|
|
|
204 |
|
|
|
205 |
### remove all the cells without BCR info and retain only cells in the B cell space. |
|
|
206 |
|
|
|
207 |
```{r} |
|
|
208 |
cd3_minus_meta<- read_tsv(here('data/cd3_minus_metadata.tsv'), guess_max = 1000000) |
|
|
209 |
|
|
|
210 |
scBCR_heavy<-scBCR_heavy %>% |
|
|
211 |
inner_join(cd3_minus_meta %>% select(cell, clusters_anno_sub), by=c("barcode" = "cell")) |
|
|
212 |
|
|
|
213 |
scBCR_heavy<- scBCR_heavy %>% |
|
|
214 |
mutate(cloneType_heavy = paste0("clonetype_heavy_", group_indices_(., .dots = c('v_gene', 'j_gene', 'cdr3')))) |
|
|
215 |
|
|
|
216 |
scBCR_heavy$group<- factor(scBCR_heavy$group, levels = c("Healthy_donors", |
|
|
217 |
"New_C1D1", |
|
|
218 |
"refractory_CR_C1D1", |
|
|
219 |
"refractory_PR_C1D1", |
|
|
220 |
"refractory_PD_C1D1", |
|
|
221 |
"refractory_CR_C4D1", |
|
|
222 |
"refractory_PR_C4D1", |
|
|
223 |
"refractory_PD_C4D1")) |
|
|
224 |
|
|
|
225 |
write_csv(scBCR_heavy, here("results/figures/scBCR_productive_heavy_chain_B_cells_only.csv")) |
|
|
226 |
``` |
|
|
227 |
|
|
|
228 |
|
|
|
229 |
### diversity |
|
|
230 |
|
|
|
231 |
```{r} |
|
|
232 |
library(tcR) |
|
|
233 |
## count unique clonotype |
|
|
234 |
count_unique_clonotype<- function(df){ |
|
|
235 |
df<- df %>% |
|
|
236 |
count(pool_id) |
|
|
237 |
return(df) |
|
|
238 |
} |
|
|
239 |
## chao1 diversity |
|
|
240 |
calculate_diversity<- function(df){ |
|
|
241 |
df<- df %>% |
|
|
242 |
group_by(pool_id) %>% |
|
|
243 |
summarize(diversity = chao1(Read.count)[1]) |
|
|
244 |
return(df) |
|
|
245 |
} |
|
|
246 |
|
|
|
247 |
|
|
|
248 |
scBCR_heavy_nest<- scBCR_heavy %>% |
|
|
249 |
group_by(group) %>% |
|
|
250 |
count(pool_id, cloneType_heavy, name = "Read.count") %>% |
|
|
251 |
nest(-group) |
|
|
252 |
|
|
|
253 |
|
|
|
254 |
scBCR_unique_clonotype<- scBCR_heavy_nest %>% |
|
|
255 |
mutate(diversity = map(data, count_unique_clonotype)) %>% |
|
|
256 |
select(-data) %>% |
|
|
257 |
unnest(diversity) |
|
|
258 |
|
|
|
259 |
|
|
|
260 |
scBCR_diversity<- scBCR_heavy_nest %>% |
|
|
261 |
mutate(diversity = map(data, calculate_diversity)) %>% |
|
|
262 |
select(-data) %>% |
|
|
263 |
unnest(diversity) |
|
|
264 |
|
|
|
265 |
``` |
|
|
266 |
|
|
|
267 |
|
|
|
268 |
### plotting |
|
|
269 |
|
|
|
270 |
|
|
|
271 |
```{r} |
|
|
272 |
scBCR_unique_clonotype %>% |
|
|
273 |
ggplot(aes(x=group, y = n)) + |
|
|
274 |
geom_boxplot(aes(fill = group), outlier.shape = NA) + |
|
|
275 |
geom_point() + |
|
|
276 |
theme_bw(base_size = 14) + |
|
|
277 |
theme(axis.text.x = element_blank(), |
|
|
278 |
axis.ticks.x = element_blank(), |
|
|
279 |
strip.text = element_text(size = 8)) + |
|
|
280 |
xlab("") + |
|
|
281 |
ylab("unique clonotype") + |
|
|
282 |
ggtitle("BCR heavy unique clonotype") |
|
|
283 |
ggsave(here("results/figures/BCR_unique_clonotype_filtered_BCR_heavy_per_sample.pdf"), width =8, height = 6) |
|
|
284 |
|
|
|
285 |
write_csv(scBCR_unique_clonotype,here("results/figures/BCR_unique_clonotype_filtered_BCR_heavy_per_sample.csv")) |
|
|
286 |
|
|
|
287 |
scBCR_diversity %>% |
|
|
288 |
ggplot(aes(x=group, y = diversity)) + |
|
|
289 |
geom_boxplot(aes(fill = group), outlier.shape = NA) + |
|
|
290 |
geom_point() + |
|
|
291 |
theme_bw(base_size = 14) + |
|
|
292 |
theme(axis.text.x = element_blank(), |
|
|
293 |
axis.ticks.x = element_blank(), |
|
|
294 |
strip.text = element_text(size = 8)) + |
|
|
295 |
xlab("") + |
|
|
296 |
ylab("chao1 index") + |
|
|
297 |
ggtitle("chao1 BCR heavy diversity") |
|
|
298 |
ggsave(here("results/figures/BCR_chao1_filtered_BCR_heavy_per_sample.pdf"), width =8, height = 6) |
|
|
299 |
|
|
|
300 |
write_csv(scBCR_diversity, here("results/figures/BCR_chao1_filtered_BCR_heavy_per_sample.csv")) |
|
|
301 |
|
|
|
302 |
``` |
|
|
303 |
|
|
|
304 |
|
|
|
305 |
```{r} |
|
|
306 |
cells_per_sample<- scBCR_heavy %>% |
|
|
307 |
count(group, pool_id, name = "total_number_cells") |
|
|
308 |
|
|
|
309 |
|
|
|
310 |
cells_per_sample %>% |
|
|
311 |
ggplot(aes(x=group, y = total_number_cells)) + |
|
|
312 |
geom_boxplot(aes(fill = group), outlier.shape = NA) + |
|
|
313 |
geom_point() + |
|
|
314 |
theme_bw(base_size = 14) + |
|
|
315 |
theme(axis.text.x = element_blank(), |
|
|
316 |
axis.ticks.x = element_blank(), |
|
|
317 |
strip.text = element_text(size = 8)) + |
|
|
318 |
xlab("") + |
|
|
319 |
ylab("number of cells") + |
|
|
320 |
ggtitle("total number of cells") |
|
|
321 |
ggsave(here("results/figures/BCR_total_number_cells_BCR_heavy_per_sample.pdf"), width =8, height = 6) |
|
|
322 |
|
|
|
323 |
write_csv(cells_per_sample, here("results/figures/BCR_total_number_cells_BCR_heavy_per_sample.csv")) |
|
|
324 |
``` |
|
|
325 |
|
|
|
326 |
|