-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathLCA_pipeline_setup.sh
executable file
·436 lines (379 loc) · 17 KB
/
LCA_pipeline_setup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
#!/bin/bash
# A: Lisa Sikkema, 2020
# D: download files, set up conda environment and build reference genome for Lung Cell Atlas cellranger pipeline
# version of pipeline
pipeline_version="1.0.0"
# parameter defaults:
# Ensembl release
ensrel="93"
# Genome string
genomestring="GRCh38"
# species
species="homo_sapiens"
# Expected cell ranger version
expectedcrv="3.1.0"
# Resources for mkref
# number of cores
nthreads="20"
# Memory in GB
memgb="48"
# Include Sars-cov2 genome:
incl_sarscov2=false
# script parts to run/skip:
download_files=true
create_env=true
build_ref_genome=true
download_ensembl_files=true
usage() {
cat <<HELP_USAGE
Lung Cell Atlas pipeline version: v${pipeline_version}
Usage: $(basename "$0") [-hwcutmsegDCRLS]
-h show this help text
Mandatory arguments:
-w <path_to_work_dir> path to working directory. Testdata, reference
genomes, log files etc. will be stored here.
-c <path_to_envs_dir> path to envs directory from your miniconda or
anaconda, with trailing slash, e.g.
/Users/doejohn/miniconda3/envs/
-u <user:pass> user:pass received from Lung Cell Atlas to
acquire access to files for download
Optional arguments (with default settings):
-t <n_threads> number of threads to be used by cellranger mkref
(default: ${nthreads})
-m <mem_in_Gb> memory in GB to be used by cellranger mkref
(default: ${memgb})
-s <species> species for genome building. Use homo_sapiens
for human. (default: ${species})
-e <ensembl_release> ensembl release of reference genome to build.
(default: ${ensrel})
-g <genome_release> genome release, must match with the
corresponding Ensembl release (see also Ensembls
fa file name for the release), use without the
patch id (e.g. GRCh37; GRCh38, GRCm38)
(default: ${genomestring})
Optional arguments to skip parts of script:
-D <true|false> include download of required files. Set this
to false if files were already downloaded and
you want to skip this step. (default: ${create_env})
-C <true|false> include creation of conda environment. Set this to
false if conda environment is already created
and you want to skip this step.
(default: ${download_files})
-R <true|false> include building of reference genome. Set this
to false if reference genome has already been
built and you want to skipt this step.
(default: ${build_ref_genome})
-L <true|false> include download of ensembl files needed to build
reference genome. Set to false if the files were already
downloaded in a previous run. Set to true otherwise.
If set to true, any existing folders in your
[work_dir]/refgenomes/ directory matching with the
specified species, ensembl_release and genome_release
will be removed. If set to false, the necessary
downloaded files should be present in your
refgenomes/${species}_${genomestring}_ensrel${ensrel}
folder. (default: ${download_ensembl_files})
Optional argument to include Sars-CoV2 in the reference genome:
-S <true|false> include Sars-cov2 in the reference genome. Set
this to true if Sars-cov2 should be included.
(default: ${incl_sarscov2})
HELP_USAGE
}
# TAKE IN ARGUMENTS
# go through optional arguments:
# preceding colon after getopts: don't allow for any automated error messages
# colon following flag letter: this flag takes in an argument
while getopts ":hw:c:u:t:m:s:e:g:D:C:R:L:S:" opt; do
# case is bash version of 'if' that allows for multiple scenarios
case "$opt" in
# if h is added, print usage and exit
h ) usage
exit
;;
t ) nthreads=$OPTARG
;; # continue
m ) memgb=$OPTARG
;;
s ) species=$OPTARG
;;
e ) ensrel=$OPTARG
;;
g ) genomestring=$OPTARG
;;
w ) work_dir=$OPTARG
;;
c ) conda_envs_dir=$OPTARG
;;
u ) user_pass=$OPTARG
;;
D ) download_files=$OPTARG
;;
C ) create_env=$OPTARG
;;
R ) build_ref_genome=$OPTARG
;;
L ) download_ensembl_files=$OPTARG
;;
S ) incl_sarscov2=$OPTARG
;;
# if unknown flag, print error message and put it into stderr
\? ) echo "Invalid option: $OPTARG" >&2
usage
exit 1
;;
# if argument is missing from flag that requires argument,
# print error and exit 1.
# the print and echo outputs are sent to stderr file?
# then exit 1
: ) printf "missing argument for -%s\n" "$OPTARG" >&2
echo "$usage" >&2
exit 1
;;
esac
done
# move to next argument, and go through loop again:
shift $((OPTIND -1))
# CHECK PASSED ARGUMENTS/PARAMETERS:
# check if arguments specifying steps to skip/include (-DCR) are set to either true or false
# first convert them to lower case:
download_files="${download_files,,}"
download_ensembl_files="${download_ensembl_files,,}"
create_env="${create_env,,}"
build_ref_genome="${build_ref_genome,,}"
incl_sarscov2="${incl_sarscov2,,}"
# check if they're set to either true or false
if [ "$download_files" != "true" ] && [ "$download_files" != "false" ]; then
echo "-D flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
# do the same for create_env:
if [ "$create_env" != "true" ] && [ "$create_env" != "false" ]; then
"-C flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
# and for ref genome:
if [ "$build_ref_genome" != "true" ] && [ "$build_ref_genome" != "false" ]; then
echo "-R flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
# if files have to be downloaded, then check if user_pass argument was provided:
# -z is True if string has length 0
if [ "$download_files" == "true" ] && [ -z $user_pass ]; then
echo "user pass argument (-u flag) not provided. Exiting."
exit 1
fi
# check if conda_envs_dir argument was passed:
if [[ ("$create_env" == "true" || "$build_ref_genome" == "true") && -z $conda_envs_dir ]]; then
echo "conda environment directory (-c flag) argument not provided. Exiting."
exit 1
fi
# check if conda_envs_dir is a directory:
if [ "$create_env" == "true" ] || [ "$build_ref_genome" == "true" ]; then
# check if it is a directory (if not: exit), and if directory ends with /envs. (If not, only print a warning.)
if ! [ -d $conda_envs_dir ]; then
echo "Specified conda_envs_dir $conda_envs_dir does not exist. Exiting."
exit 1
elif [[ $conda_envs_dir != */envs/ ]]; then
echo "Warning: Specified conda envs directory $conda_envs_dir does not end with \"/envs/\". Make sure a trailing slash is included. Is this the correct directory?"
fi
fi
# check if workdir argument was passed:
if [ -z $work_dir ]; then
echo "no argument provided for -w flag (work_dir). Exiting."
exit 1
fi
# check if workdir is a directory
if ! [ -d $work_dir ]; then
echo "the provided work_dir (-w flag) is not a directory. Exiting."
exit 1
fi
# if reference genome build is included, check if $incl_sarscov2 is either true or false:
if [ "$build_ref_genome" == "true" ]; then
if [ "$incl_sarscov2" != "true" ] && [ "$incl_sarscov2" != "false" ]; then
echo "-S flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
fi
# if reference genome build is included, check if $download_ensembl_files is set to either true or false
if [ "$build_ref_genome" == "true" ]; then
if [ "$download_ensembl_files" != "true" ] && [ "$download_ensembl_files" != "false" ]; then
echo "-L flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
fi
# CREATE LOG AND PRINT PARAMETERS
# store path where current script is located as script_dir
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# cd into workpath and store full path (wihtout trailing slash)
cd $work_dir
work_dir=`pwd`
# Log filenname
LOGFILE=$work_dir/"LOG_LCA_pipeline_setup.log"
# Check if logfile exists
if [ -f ${LOGFILE} ]; then
echo "ERROR: LOG file ${LOGFILE} already exists. please remove. exit."
exit 1
fi
# Create log file and add DATE
echo `date` > ${LOGFILE}
echo "Log file saved in : ${LOGFILE}"
# print pipeline version
echo "Lung Cell Atlas pipeline version: v${pipeline_version}" | tee -a ${LOGFILE}
# Print which steps will be skipped or included:
echo "STEPS TO BE INCLUDED/SKIPPED:" | tee -a ${LOGFILE}
# print for each step if it is included or skipped
if [ "$download_files" == "true" ]; then
echo "downloading of required files will be included" | tee -a ${LOGFILE}
elif [ "$download_files" == "false" ]; then
echo "downloading of required files will be skipped" | tee -a ${LOGFILE}
fi
# do the same for create_env:
if [ "$create_env" == "true" ]; then
echo "creation of conda environment will be included" | tee -a ${LOGFILE}
elif [ "$create_env" == "false" ]; then
echo "creation of conda environment will be skipped" | tee -a ${LOGFILE}
fi
# and for build_ref_genome
if [ "$build_ref_genome" == "true" ]; then
echo "building of reference genome will be included" | tee -a ${LOGFILE}
if [ "$download_ensembl_files" == "false" ]; then
echo "download of files from ensembl needed for refgenome building will be skipped." | tee -a ${LOGFILE}
fi
if [ "$incl_sarscov2" == "true" ]; then
echo "Sars-cov2 genome will be added to the reference genome." | tee -a ${LOGFILE}
fi
elif [ "$build_ref_genome" == "false" ]; then
echo "building of reference genome will be skipped" | tee -a ${LOGFILE}
fi
# print parameters. tee command (t-split) splits output into normal printing and a second target,
# in this case the log file to which it will -a(ppend) the output.
# i.e. parameters are printed and stored in logfile.
echo "Params:" | tee -a ${LOGFILE}
echo "cellranger version expected: ${expectedcrv}, Ensembl release: ${ensrel}, Genome release: ${genomestring}, Species: ${species}" | tee -a ${LOGFILE}
echo "nthreads: ${nthreads}, memgb: ${memgb}" | tee -a ${LOGFILE}
echo "user:pass provided" | tee -a ${LOGFILE}
echo "work directory: $work_dir" | tee -a ${LOGFILE}
echo "conda_envs_dir: $conda_envs_dir" | tee -a ${LOGFILE}
# let user confirm parameters:
read -r -p "Are the parameters correct? Continue? [y/N] " response
if [[ "$response" =~ ^([yY][eE][sS]|[yY])$ ]]
then
echo "Parameters confirmed." | tee -a ${LOGFILE}
else
echo "Parameters not confirmed, exit." | tee -a ${LOGFILE}
exit 1
fi
# DOWNLOAD THE REQUIRED FILES, if $download_files == true:
if [ "$download_files" == "true" ]; then
# now download the tar file:
echo "We will download the necessary files now, this shouldn't take too long..." | tee -a ${LOGFILE}
curl --user $user_pass https://hmgubox2.helmholtz-muenchen.de/public.php/webdav/LCA_pipeline_downloads.tar.gz --output $work_dir/LCA_pipeline_downloads.tar.gz -k
curl --user $user_pass https://hmgubox2.helmholtz-muenchen.de/public.php/webdav/LCA_pipeline_downloads_CHECKSUM --output $work_dir/LCA_pipeline_downloads_CHECKSUM -k
echo "Done." | tee -a ${LOGFILE}
# validate that file is intact and not corrupted using checksum:
echo "Checking md5sum of downloaded file..." | tee -a ${LOGFILE}
md5sum -c $work_dir/LCA_pipeline_downloads_CHECKSUM | tee -a ${LOGFILE}
# now unpack downloaded file:
echo "Unpacking downloaded tar file now..." | tee -a ${LOGFILE}
tar -xzvf $work_dir/LCA_pipeline_downloads.tar.gz | tee -a ${LOGFILE}
echo "Done" | tee -a ${LOGFILE}
# move directories up one folder and remove donwload dir + tar
mv $work_dir/LCA_pipeline_downloads/* $work_dir/ 2>&1 | tee -a ${LOGFILE}
rmdir $work_dir/LCA_pipeline_downloads 2>&1 | tee -a ${LOGFILE}
rm $work_dir/LCA_pipeline_downloads.tar.gz 2>&1 | tee -a ${LOGFILE}
fi
# CREATE CONDA ENVIRONMENT, if $create_env == true:
path_to_env="${conda_envs_dir}cr3-velocyto-scanpy"
if [ "$create_env" == "true" ]; then
echo "Creating conda environment in ${path_to_env}... NOTE! This can take a few hours..." | tee -a ${LOGFILE}
echo "start time: `date`" | tee -a ${LOGFILE}
# script seems to terminate after conda create command, so now running from subshell (parentheses)
(
conda create --prefix $path_to_env -c $work_dir/conda-bld -c conda-forge -c bioconda -y cellranger=3.1.0=0 scanpy=1.4.4.post1=py_3 velocyto.py=0.17.17=py36hc1659b7_0 samtools=1.10=h9402c20_2 conda=4.8.2=py36_0 nextflow=19.10 java-jdk=8.0.112 | tee -a ${LOGFILE}
) | tee -a ${LOGFILE}
echo "End time: `date`" | tee -a ${LOGFILE}
fi
# BUILD REFERENCE GENOME, if $build_ref_genome == true:
if [ "$build_ref_genome" == "true" ]; then
# store conda version:
conda_version_long=`conda --version`
# remove "conda " prefix if it's there
prefix="conda "
conda_version=${conda_version_long#"$prefix"}
# now see if version is greater/smaller than or equal to 4.6 for conda --base command:
min_version="4.6"
if `$script_dir/src/versiontester.sh $conda_version $min_version '>'` || `$script_dir/src/versiontester.sh $conda_version $min_version '='`; then
# activate environment prep. Since the command conda activate doesn't (always?) work
# in subshell, we first want to use source to make the command available:
echo "running source [path to conda.sh]..." | tee -a ${LOGFILE}
path_to_conda_sh=$(conda info --base)/etc/profile.d/conda.sh
source $path_to_conda_sh
elif $script_dir/src/versiontester.sh $conda_version $min_version '<'; then
# do nothing?
:
else
echo "conda version checker did not work properly. Exiting." | tee -a ${LOGFILE}
exit 0
fi
# conda environment activation works differently before and after version 4.4
min_version="4.4"
if `$script_dir/src/versiontester.sh $conda_version $min_version '>'` || `$script_dir/src/versiontester.sh $conda_version $min_version '='`; then
# activate environment using conda activate
echo "Activating environment using conda activate..." | tee -a ${LOGFILE}
conda activate $path_to_env # this cannot be put into LOGFILE, because then the conda environment is not properly activated for some reason.
elif $script_dir/src/versiontester.sh $conda_version $min_version '<'; then
# activate environment using source activate
echo "Activating environment using source activate..." | tee -a ${LOGFILE}
source activate $path_to_env
else
echo "conda version checker did not work properly. Exiting." | tee -a ${LOGFILE}
exit 0
fi
# now start building the reference genome!
# create (if not there yet) and cd into refgenomes folder;
if ! [ -d refgenomes ]; then
mkdir refgenomes
fi
cd refgenomes
# create/cd into folder that corresponds to our species and genome version, so that no mixing of versions is possible
genome_name=${species}_${genomestring}_ensrel${ensrel}
if ! [ -d $genome_name ]; then
mkdir $genome_name
fi
cd $genome_name
echo "Currently working in folder `pwd`" | tee -a ${LOGFILE}
# now run the script to build the genome:
echo "We will now start building the reference genome, using the script $script_dir/src/create_cellranger_ref_from_ensembl.sh" | tee -a ${LOGFILE}
echo "This might take a few hours. Start time: `date`" | tee -a ${LOGFILE}
echo "For a detailed log of the genome building, check out the logfile in your ${work_dir}/refgenomes/${genome_name} folder!" | tee -a ${LOGFILE}
if [ "$incl_sarscov2" == "false" ]; then
if [ "$download_ensembl_files" == "true" ]; then
# -d default true, u true
${script_dir}/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -u true -o ${work_dir}/refgenomes/${genome_name} | tee -a ${LOGFILE}
else
# -d to false
${script_dir}/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -d false -o ${work_dir}/refgenomes/${genome_name} | tee -a ${LOGFILE}
fi
# check md5sum if default parameters were used:
if [ "${genomestring}" == "GRCh38" ] && [ "${ensrel}" == "93" ] && [ "${species}" == "homo_sapiens" ] && [ "${expectedcrv}" == "3.1.0" ]; then
echo "Checking md5sum of output folder..." | tee -a ${LOGFILE}
md5sum -c $script_dir/src/refgenomes_md5checks/homo_sapiens_GRCh38_ensrel93_cr3.1.0.md5 | tee -a ${LOGFILE}
fi
elif [ "$incl_sarscov2" == "true" ]; then
echo "Including Sars-cov2 genome into the reference..." | tee -a ${LOGFILE}
if [ "$download_ensembl_files" == "true" ]; then
# -d default true, u true
$script_dir/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -u true -n sars_cov2 -f $script_dir/res/sars_cov2_genome/sars_cov2_genome.gtf -a $script_dir/res/sars_cov2_genome/sars_cov2.fasta -o ${work_dir}/refgenomes/${genome_name} | tee -a ${LOGFILE}
else
# -d to false
$script_dir/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -d false -n sars_cov2 -f $script_dir/res/sars_cov2_genome/sars_cov2_genome.gtf -a $script_dir/res/sars_cov2_genome/sars_cov2.fasta -o ${work_dir}/refgenomes/${genome_name} | tee -a ${LOGFILE}
fi
# # check md5sum if default parameters were used (didn't generate the checksum file for ensemble 93 yet):
# if [ ${genomestring} == "GRCh38" ] && [ "${ensrel}" == "93" ] && [ "${species}" == "homo_sapiens" ] && [ "${expectedcrv}" == "3.1.0" ]; then
# echo "Checking md5sum of output folder..." | tee -a ${LOGFILE}
# md5sum -c $script_dir/src/refgenomes_md5checks/homo_sapiens_GRCh38_ensrel93_cr3.1.0_sars_cov2.md5 | tee -a ${LOGFILE}
# fi
fi
echo "End time: `date`" | tee -a ${LOGFILE}
fi
echo 'End of script.' | tee -a ${LOGFILE}