-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathrun
executable file
·242 lines (190 loc) · 6.48 KB
/
run
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env perl
use strict;
use warnings;
use File::Basename;
use File::Spec;
use Cwd;
use Cwd 'abs_path';
use Time::HiRes;
# Usage text. Printed through die(), so it goes to STDERR and the script
# exits with a non-zero status when no route is given.
my $HELP = <<HELP;

  Run pipeline.

  Must be executed from a project directory that contains the settings.txt file
  and the sample sheet for the route: samples.fastq-raw.csv by default,
  samples.pairs.csv for "-pairs-" routes, samples.groups.csv for "-groups-" routes.

  usage: run route [test]

  * route = name of pipeline (corresponds to a file in routes directory)
  * test = optional; run the first sample directly instead of submitting it with sbatch
           (ignored by "-groups-" routes)

HELP

# A route name is mandatory. Test definedness/length rather than truthiness so
# that an (unlikely) route literally named "0" is not mistaken for "missing".
unless ( defined $ARGV[0] && length $ARGV[0] ) {
    die $HELP;
}

main();
# main subroutine
#
# Entry point: validates the project setup and dispatches the requested route.
#   - reads the route name from $ARGV[0]
#   - uses the current working directory as the project dir
#   - uses the directory that contains this file as the code ("sns") dir
#   - dies if the route script or the project's settings.txt does not exist,
#     or if the sbatch logs dir cannot be created or entered
#   - calls run_groups() for "-groups-" routes and run_samples() for all others
sub main {
    my $route = $ARGV[0];

    # current dir (should be the project dir)
    my $project_dir = getcwd();
    unless ( defined $project_dir ) {
        die "\n\n ERROR: CANNOT DETERMINE CURRENT DIRECTORY: $! \n\n";
    }
    print "project dir: $project_dir \n";

    # code ("sns") dir (where this file is)
    my $code_dir = dirname( File::Spec->rel2abs(__FILE__) );
    print "code dir: $code_dir \n";

    sleep(1);

    # check that the specified route exists
    my $route_script = "${code_dir}/routes/${route}.sh";
    unless ( -e $route_script ) {
        die "\n\n ERROR: $route DOES NOT EXIST \n\n";
    }

    # check that settings file exists
    my $settings_txt = "${project_dir}/settings.txt";
    unless ( -e $settings_txt ) {
        die "\n\n ERROR: $settings_txt DOES NOT EXIST \n\n";
    }

    # move to sbatch logs dir so the sbatch output ends up there
    # (the parent is the current directory, so a single-level mkdir is enough;
    # no shell is involved, so paths with spaces are handled correctly)
    my $sbatch_logs_dir = "${project_dir}/logs-sbatch";
    if ( !-d $sbatch_logs_dir && !mkdir($sbatch_logs_dir) ) {
        my $mkdir_error = $!;
        # tolerate a concurrent run having created the directory in the meantime
        unless ( -d $sbatch_logs_dir ) {
            die "\n\n ERROR: CANNOT CREATE $sbatch_logs_dir: $mkdir_error \n\n";
        }
    }
    print "logs dir: $sbatch_logs_dir \n";
    # without this check a failed chdir would silently scatter sbatch logs elsewhere
    chdir($sbatch_logs_dir)
        or die "\n\n ERROR: CANNOT CHANGE TO $sbatch_logs_dir: $! \n\n";

    # determine sample sheet depending on route type
    my $samples_csv = get_sample_sheet($project_dir, $code_dir, $route);
    print "sample sheet: $samples_csv \n";

    sleep(1);

    if ( $route =~ /-groups-/ ) {
        # grouped samples route (all samples processed together)
        run_groups($project_dir, $route, $route_script);
    }
    else {
        # generic single-sample route or paired samples route (each sample or pair processed separately)
        run_samples($project_dir, $route, $route_script, $samples_csv);
    }
}
# get the sample sheet depending on the route type
#
# Arguments:
#   $project_dir - project directory that holds the samples.*.csv files
#   $code_dir    - code ("sns") directory; scripts/fix-csv.sh is run from here
#   $route       - route name; "-groups-" selects samples.groups.csv,
#                  "-pairs-" selects samples.pairs.csv, anything else selects
#                  samples.fastq-raw.csv
# Returns the absolute path of the selected sample sheet.
# Dies if the selected sheet does not exist or is empty.
# Side effects: runs fix-csv.sh on the selected sheet; for the default route
# type it first runs fix-csv.sh on every samples.*.csv in the project dir.
sub get_sample_sheet {
    my ($project_dir, $code_dir, $route) = @_;

    my $fix_csv_script = "${code_dir}/scripts/fix-csv.sh";

    # List-form system() bypasses the shell, so paths containing spaces or shell
    # metacharacters reach the script intact. Cleanup stays best-effort: a
    # failure is reported but does not abort the run.
    my $fix_csv = sub {
        my ($csv) = @_;
        system("bash", $fix_csv_script, $csv) == 0
            or warn "WARNING: fix-csv.sh failed for $csv (status $?)\n";
    };

    my $sheet_name;
    if ( $route =~ /-groups-/ ) {
        $sheet_name = "samples.groups.csv";
    }
    elsif ( $route =~ /-pairs-/ ) {
        $sheet_name = "samples.pairs.csv";
    }
    else {
        $sheet_name = "samples.fastq-raw.csv";

        # remove problematic characters from all existing samples sheets
        # (readdir instead of a shell "ls" glob: same samples.*.csv pattern, no shell)
        if ( opendir(my $project_dh, $project_dir) ) {
            my @sheet_names = sort grep { /\Asamples\..*\.csv\z/s } readdir($project_dh);
            closedir($project_dh);
            for my $name (@sheet_names) {
                my $csv = "${project_dir}/${name}";
                next unless -f $csv;
                $fix_csv->($csv);
            }
        }
        else {
            warn "WARNING: cannot list $project_dir: $!\n";
        }
        sleep(1);
    }

    # abs_path() can return undef when the path cannot be resolved; fall back to
    # the unresolved path so the existence check below reports it by name
    my $sheet_path  = "${project_dir}/${sheet_name}";
    my $samples_csv = abs_path($sheet_path);
    $samples_csv = $sheet_path unless defined $samples_csv;

    # check that sample sheet exists
    unless ( -e $samples_csv ) {
        die "\n\n ERROR: SAMPLE SHEET $samples_csv DOES NOT EXIST \n\n";
    }

    # check that sample sheet is not empty
    if ( -z $samples_csv ) {
        die "\n\n ERROR: SAMPLE SHEET $samples_csv IS EMPTY \n\n";
    }

    # remove problematic characters from sample sheet
    $fix_csv->($samples_csv);

    return $samples_csv;
}
# process samples and submit a job for each
#
# Arguments:
#   $project_dir  - project directory (passed to the route script)
#   $route        - route name (used in the job name; "-pairs-" enables pair handling)
#   $route_script - path of the route script; its first line containing
#                   "SBATCHTIME=" supplies the sbatch --time value
#   $samples_csv  - sample sheet; the first column is the sample name, or, for
#                   "-pairs-" routes, each non-header row is one pair
# Dies if the route runtime is not specified plausibly, if the sample sheet
# cannot be read, if a sample name contains problematic characters, or if a
# pair contains a missing ("NA") sample.
# If $ARGV[1] is "test", the first sample is run directly (no sbatch) and the
# script exits.
sub run_samples {
    my ($project_dir, $route, $route_script, $samples_csv) = @_;

    my $is_pairs_route = ( $route =~ /-pairs-/ ) ? 1 : 0;

    # get the maximum runtime for route (for sbatch)
    # (read in Perl rather than through "cat | grep" so that paths with spaces work)
    my $route_time = "";
    if ( open(my $route_fh, '<', $route_script) ) {
        while ( my $line = <$route_fh> ) {
            next unless $line =~ /SBATCHTIME=/;
            chomp($line);
            $line =~ s/.*=//;
            $route_time = $line;
            last;
        }
        close($route_fh);
    }
    if ( length($route_time) < 3 || length($route_time) > 15 ) {
        die "\n\n ERROR: ROUTE RUNTIME $route_time NOT PROPERLY SPECIFIED IN ROUTE \n\n";
    }

    # find unique samples based on the sample sheet
    open(my $csv_fh, '<', $samples_csv)
        or die "\n\n ERROR: CANNOT READ SAMPLE SHEET $samples_csv: $! \n\n";
    my %seen;
    while ( my $line = <$csv_fh> ) {
        chomp($line);
        my $entry;
        if ($is_pairs_route) {
            # skip the header row; a pair is the whole row with ":" as separator
            next if $line =~ /#SAMPLE/;
            ( $entry = $line ) =~ tr/,/:/;
        }
        else {
            # first column only
            ($entry) = split( /,/, $line, 2 );
        }
        # Blank rows yield no sample. Previously an empty entry sorted first and
        # silently ended the submission loop before any job was submitted.
        next unless defined $entry && length $entry;
        $seen{$entry} = 1;
    }
    close($csv_fh);
    # plain string sort is byte-wise, the same order as "LC_ALL=C sort"
    my @unique_samples = sort keys %seen;

    # check that sample names do not contain problematic characters
    my @invalid_samples = grep { /[^\w\-\.\:\~]/ } @unique_samples;
    if (@invalid_samples) {
        my $invalid_samples_str = join(', ', @invalid_samples);
        die "\n\n ERROR: SAMPLES $invalid_samples_str CONTAIN PROBLEMATIC CHARACTERS \n\n";
    }

    # check that pairs are valid (default sample sheet has normals as "NA")
    if ($is_pairs_route) {
        my @invalid_pairs = grep { /:NA$/ } @unique_samples;
        if (@invalid_pairs) {
            my $invalid_pairs_str = join(', ', @invalid_pairs);
            die "\n\n ERROR: PAIRS $invalid_pairs_str CONTAIN MISSING SAMPLES (LABELED NA) \n\n";
        }
    }

    # threads per sample (extra thread for overhead)
    # requesting fewer threads (5) for projects with more than 50 samples is
    # deactivated due to causing memory issues in rare cases
    my $threads = 9;
    my $mem     = $threads * 8;

    # "test" argument: test one sample without sbatch (for troubleshooting)
    my $test_mode = ( defined($ARGV[1]) && $ARGV[1] eq "test" ) ? 1 : 0;

    # sbatch settings that are the same for every sample
    my $user        = defined $ENV{USER} ? $ENV{USER} : "";
    my $email       = "${user}\@nyulangone.org";
    my $sbatch_mail = "--mail-user=${email} --mail-type=FAIL,REQUEUE";
    my $sbatch_res  = "--time=${route_time} --nodes=1 --ntasks=1 --cpus-per-task=${threads} --mem=${mem}G";

    # process each sample
    # (a for loop, so that a sample named "0" does not end processing early)
    my $current_sample = 1;
    for my $sample (@unique_samples) {
        print "\n process sample $sample \n";

        # split paired samples for command (needs two samples); the separator is
        # replaced in the sample itself, never in the directory or script paths
        my $job_label   = $sample;
        my $sample_args = $sample;
        if ($is_pairs_route) {
            $job_label   =~ s/:/-/;
            $sample_args =~ s/:/ /;
        }

        # route command
        my $cmd = "bash $route_script $project_dir $sample_args";

        if ($test_mode) {
            print "\n\n test run started \n\n";
            system("export SLURM_CPUS_PER_TASK=5 && $cmd");
            print "\n\n test run finished \n\n";
            exit;
        }

        # sbatch command
        my $sbatch_name = "--job-name=sns.${route}.${job_label}";
        my $sbatch_cmd  = "sbatch $sbatch_res $sbatch_name $sbatch_mail --export=NONE --wrap='${cmd}'";
        print "\n CMD: $sbatch_cmd \n";
        system($sbatch_cmd) == 0
            or warn "WARNING: sbatch submission failed for $sample (status $?)\n";

        # pause between samples (sometimes job scheduler gets overwhelmed)
        # the builtin sleep() only takes whole seconds, so sleep(0.2) did not pause at all
        Time::HiRes::sleep(0.2);

        # extra pause for the first few samples so it's easier to fix obvious errors
        if ( $current_sample < 5 ) {
            sleep(15);
        }
        $current_sample++;
    }
}
# process sample groups (all samples processed together)
#
# Arguments:
#   $project_dir  - project directory (passed to the route script)
#   $route        - route name (used in the job name)
#   $route_script - path of the route script to run
# Submits a single sbatch job that runs the route script on the whole project,
# with fixed resources (10 hours, 5 CPUs, 50G memory). A failed submission is
# reported with a warning.
sub run_groups {
    my ($project_dir, $route, $route_script) = @_;

    # route command
    my $cmd = "bash $route_script $project_dir";

    # notification address, built from the environment directly instead of
    # spawning a shell just to expand ${USER}
    my $user  = defined $ENV{USER} ? $ENV{USER} : "";
    my $email = "${user}\@nyulangone.org";

    # sbatch command
    my $sbatch_name = "--job-name=sns.${route}";
    my $sbatch_mail = "--mail-user=${email} --mail-type=FAIL,REQUEUE";
    my $sbatch_res  = "--time=10:00:00 --nodes=1 --ntasks=1 --cpus-per-task=5 --mem=50G";
    my $sbatch_cmd  = "sbatch $sbatch_res $sbatch_name $sbatch_mail --export=NONE --wrap='${cmd}'";
    print "\n CMD: $sbatch_cmd \n";

    # previously a failed submission went unnoticed
    system($sbatch_cmd) == 0
        or warn "WARNING: sbatch submission failed for $route (status $?)\n";
}
# end