-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilterStudyAnimalSpeciesStrain.R
384 lines (335 loc) · 21.1 KB
/
filterStudyAnimalSpeciesStrain.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
###################################################################################
# Script name : filterStudyAnimalSpeciesStrain.R
# Date Created : 16-Jan-2020
# Programmer : Bo Larsen
# --------------------------------------------------------------------------------
# Change log:
# Programmer/date Description
# ----------------- ------------------------------------------------------------
# <init/dd-Mon-yyyy> <description>
#
# -------------------------------------------------------------------------------
# Purpose : Extract studies and animals which fulfills a specified species
# value and optional strain value from a pooled SEND data store.
#
# Description : Function FilterAnimalsSpeciesStrain:
#
# Returns a data table with a set of animals extracted from the table of
# animals given as input where the animals fits the species value(s) given
# as input (a list of multiple species values may be given as input)
# and also fits the strain value(s) if given as input (optional).
# Animals from the input set, which are included in studies where one of
# of the input species values - and optionally also strain values - is
# registered in TS (and TS have only this species or species/strain
# value included) are included in the output set.
# Animals from the input set, belonging to studies
# - with no species/strain value registered or
# - with more than one species/strain value registered
# in TS are included in the output set if they exists in DM with matching
# DM.SPECIES or DM.SPECIES/STRAIN values or are included in TX in a trial set
# with matching TX.SPECIES or TX.SPECIES/STRAIN (i.e. TXVAL where TXPARMCD is
# SPECIES' or 'STRAIN')
# The comparisons of the species and strain values are done case
# insensitive.
#
# If the input parameter inclUncertain flag is enabled, uncertain animals
# are included in the output set.
# These uncertain situations are identified and reported for SPECIES and STRAIN respectively
# (in column UNCERTAIN_MSG):
# - TS parameter SPECIES/STRAIN is missing or invalid (not CT value - CDISC code list SPECIES/STRAIN)
# and TX parameter SPECIES/STRAIN is missing or invalid (not CT value) and DM.SPECIES/STRAIN is
# missing or invalid (not CT value)
# - Different values of SPECIES/STRAIN across TS, TX and DM for studies where no or only one
# TS parameter SPECIES/STRAIN is registered
# - Multiple TS parameter SPECIES/STRAIN values are registered for study and TX parameter
# SPECIES/STRAIN and/or DM.SPECIES/STRAIN do not match any of the TS values.
# - Multiple TS parameter SPECIES/STRAIN values are registered for study and TX parameter
# SPECIES/STRAIN and DM.SPECIES/STRAIN are unequal.
# Non-empty UNCERTAIN_MSG values are merged with non-empty UNCERTAIN_MSG values
# which may exist in the input set of animals (animalList).
#
# Input : - The TS, TX and DM domains - are imported from the pooled SEND data store if
# the don't exist in workspace.
# - A data tables specified in the input parameters animalList:
# It contains the list of animals to filter for specified species and strain value(s)
# - must contain these character variables:
# STUDYID
# USUBJID
# other variables may be included
# - CDISC CT code lists SPECIES and STRAIN imported from a CDISC CT file.
#
# Output : A data table with the character columns:
# STUDYID
# USUBJID
# SPECIES
# STRAIN (if strainFilter has been specified)
# UNCERTAIN_MSG - if input parameter inclUncertain flag is enabled
# plus any additional columns which may be included in the input data animalList
#
# Parameters : animalList: Mandatory, data table (see Input).
# speciesFilter: Mandatory, character.
# The species value(s) to use as criterion for filtering of the input data table.
# It can be a single string, a vector or a list of multiple strings.
# strainFilter: Optional, character.
# The species value(s) to use as criterion for filtering of the input data table.
# It can be a single string, a vector or a list of multiple strings.
# Only allowed when a single value is specified for speciesFilter.
# inclUncertain: Optional, Include uncertain rows or not
#
###################################################################################
library(data.table)
FilterAnimalsSpeciesStrain<-function(animalList=NULL, speciesFilter=NULL, strainFilter=NULL, inclUncertain=FALSE, exclusively=FALSE) {
##################################################################################################################
#### Function to identify uncertain animals at species level
identifyUncertainSPECIES<-function(SPECIES, SPECIES_TS, SPECIES_TX, SPECIES_DM, ALL_SPECIES_TS, NUM_SPECIES_TS, NUM_ANIMALS) {
msgArr<-c()
if (is.na(SPECIES))
msgArr<-c(msgArr, 'TS and TX parameters SPECIES and DM.SPECIES are all missing')
else {
if (NUM_ANIMALS > NUM_SPECIES_TS)
msgArr<-c(msgArr, 'TX parameter SPECIES included multiple times for the SET')
else {
if (! SPECIES %in% ctSPECIES) {
if (!is.na(SPECIES_DM) & ! SPECIES_DM %in% ctSPECIES)
msgArr<-c(msgArr, 'DM.SPECIES does not contain a valid CT value')
else if (!is.na(SPECIES_TX) & ! SPECIES_TX %in% ctSPECIES)
msgArr<-c(msgArr, 'TX parameter SPECIES does not contain a valid CT value')
else if (!is.na(SPECIES_TS) & ! SPECIES_TS %in% ctSPECIES)
msgArr<-c(msgArr, 'TS parameter SPECIES does not contain a valid CT value')
}
if (NUM_SPECIES_TS == 1 & length(unique(na.omit(c(SPECIES_TS, SPECIES_TX, SPECIES_DM)))) > 1)
msgArr<-c(msgArr, 'Mismatch in values of TS and/or TX parameters SPECIES and/or DM.SPECIES')
else
if (NUM_SPECIES_TS > 1 & ((!SPECIES %in% ALL_SPECIES_TS) | (!is.na(SPECIES_TX) & !is.na(SPECIES_DM) & SPECIES_TX != SPECIES_DM)))
msgArr<-c(msgArr, 'Mismatch in values of TS and/or TX parameters SPECIES and/or DM.SPECIES')
}
}
msg<-paste(msgArr, collapse = ' & ')
return(ifelse(msg=="", as.character(NA), msg))
}
#### Function to identify uncertain animals at strain level
identifyUncertainSTRAIN<-function(STRAIN, STRAIN_TS, STRAIN_TX, STRAIN_DM, ALL_STRAIN_TS, NUM_STRAIN_TS, NUM_ANIMALS) {
msgArr<-c()
if (is.na(STRAIN))
msgArr<-c(msgArr, 'TS and TX parameters STRAIN and DM.STRAIN are all missing')
else {
if (NUM_ANIMALS > NUM_STRAIN_TS)
msgArr<-c(msgArr, 'TX parameter STRAIN included multiple times for the SET')
else {
if (! STRAIN %in% ctSTRAIN) {
if (!is.na(STRAIN_DM) & ! STRAIN_DM %in% ctSTRAIN)
msgArr<-c(msgArr, 'DM.STRAIN does not contain a valid CT value')
else if (!is.na(STRAIN_TX) & ! STRAIN_TX %in% ctSTRAIN)
msgArr<-c(msgArr, 'TX parameter STRAIN does not contain a valid CT value')
else if (!is.na(STRAIN_TS) & ! STRAIN_TS %in% ctSTRAIN)
msgArr<-c(msgArr, 'TS parameter STRAIN does not contain a valid CT value')
}
if (NUM_STRAIN_TS == 1 & length(unique(na.omit(c(STRAIN_TS, STRAIN_TX, STRAIN_DM)))) > 1)
msgArr<-c(msgArr, 'Mismatch in values of TS and/or TX parameters STRAIN and/or DM.STRAIN')
else
if (NUM_STRAIN_TS > 1 & ((!STRAIN %in% ALL_STRAIN_TS) | (!is.na(STRAIN_TX) & !is.na(STRAIN_DM) & STRAIN_TX != STRAIN_DM)))
msgArr<-c(msgArr, 'Mismatch in values of TS and/or TX parameters STRAIN and/or DM.STRAIN')
}
}
msg<-paste(msgArr, collapse = ' & ')
return(ifelse(msg=="", as.character(NA), msg))
}
##################################################################################################################
if (is.null(animalList) | isTRUE(is.na(animalList)) | isTRUE(animalList=="")) {
stop("animalList must be be specified with a data table")
}
if (is.null(speciesFilter) | isTRUE(is.na(speciesFilter)) | isTRUE(speciesFilter=="")) {
stop("speciesFilter must be specified")
}
InclStrainFilter<-FALSE
if (!(is.null(strainFilter) | isTRUE(is.na(strainFilter)) | isTRUE(strainFilter==""))) {
# check if strainFilter is allowed...
if (length(speciesFilter) > 1) {
stop("Not possible to handle strainFilter when speciesFilter contains multiple values")
}
InclStrainFilter<-TRUE
}
if (!(inclUncertain %in% c(TRUE,FALSE))) {
stop("Parameter inclUncertain must be either TRUE or FALSE")
}
if (!(exclusively %in% c(TRUE,FALSE))) {
stop("Parameter Exclusively must be either TRUE or FALSE")
}
# List of studyid values included in the input table of animals
animalStudies<-unique(animalList[,.(STUDYID)])
# import TS, TX and DM if they not already exists
if (!exists("TS")) {
importSENDDomains(c("TS"))
}
if (!exists("TX")) {
importSENDDomains(c("TX"), animalStudies)
}
if (!exists("DM")) {
importSENDDomains(c("DM"), animalStudies)
}
##################################################################################################################
# 1. First we look into the SPECIES level (whether or not strainFilter has been specified)
# Get values of code list SPECIES from CDISC CT
ctSPECIES<-getCTCodListValues("SPECIES")
# Extract all TS rows for parameter SPECIES, rename TSVAL to SPECIES_TS
# - remove duplicates
# - limit to the set of studies for input set of animals
tsSPECIESall<-merge(unique(TS[TSPARMCD == 'SPECIES', .(STUDYID,SPECIES_TS=toupper(trimws(TSVAL)))]), animalStudies, by='STUDYID')
# Add studies with no TS parameter SPECIES from the set of studies for input set of animals
tsSPECIESall<-
rbindlist(list(tsSPECIESall,
fsetdiff(animalStudies,
tsSPECIESall[,.(STUDYID)])[!is.na(STUDYID),.(STUDYID, SPECIES_TS=as.character(NA))]),
use.names=TRUE, fill=TRUE)
# Add variables with
# - count of number of distinct SPECIES per study
# - concatenation of all species per study (for studies with one species this is equal to SPECIES_TS)
tsSPECIESall[, `:=` (ALL_SPECIES = unique(SPECIES_TS), NUM_SPECIES_TS = .N), by = STUDYID]
tsSPECIESall[,`:=`(ALL_SPECIES_TS = c(.SD)), by = STUDYID, .SDcols='SPECIES_TS']
# JOin the list of studies/species with DM to get all animal level SPECIES
# - the list will contain STUDYID/USUBJID duplicates for studies with multiple SPECIES registered in TS
animalSPECIESall<-
merge(merge(tsSPECIESall,
animalList[,.(STUDYID, USUBJID)],
by='STUDYID', allow.cartesian = TRUE),
DM[,.(STUDYID, USUBJID, SETCD, SPECIES_DM=ifelse(SPECIES=="",as.character(NA),toupper(trimws(SPECIES))))],
by=c('STUDYID','USUBJID'), allow.cartesian = TRUE)
# Join the list of studies/animals/species with TX to get all set level SPECIES
# - add variable SPECIES with the first non-empty species value from DM, TX or TS
# - add variable with count of unique USUBJID per study (there is expected to be one usubjid per studyid per TSPARMCD 'SPECIES' )
animalSPECIESall<-
merge(animalSPECIESall,
unique(TX[TXPARMCD=='SPECIES',.(STUDYID,SETCD,SPECIES_TX=toupper(trimws(TXVAL)))]),
by=c('STUDYID','SETCD'), all.x=TRUE )[,`:=` (SPECIES=fcoalesce(SPECIES_DM,SPECIES_TX,SPECIES_TS))]
animalSPECIESall[, `:=` (NUM_ANIMALS = .N), by = .(STUDYID, USUBJID)]
# Identify uncertain animals - add variable UNCERTAIN_MSG
animalSPECIESall[,`:=` (UNCERTAIN_MSG=mapply(identifyUncertainSPECIES, SPECIES, SPECIES_TS, SPECIES_TX, SPECIES_DM, ALL_SPECIES_TS, NUM_SPECIES_TS, NUM_ANIMALS ))]
# Extract unique set of animals and related species - exclude uncertain animals
animalSPECIESallUniq<-unique(animalSPECIESall[is.na(UNCERTAIN_MSG), .(STUDYID, USUBJID, SPECIES)])
# Extract animals match
foundAnimalsSPECIES<-unique(animalSPECIESallUniq[SPECIES %in% speciesFilter, .(STUDYID, USUBJID, SPECIES)])
if (exclusively) {
# Find studies with animals having other SPECIES than the requested
foundAnimalsSPECIES<-
merge(foundAnimalsSPECIES,
# Set of studies to keep:
fsetdiff(unique(foundAnimalsSPECIES[,.(STUDYID)]),
# Set of studies (included in the found set of animals with matching SPECIES values) with possible
# SPECIES values not included in the speciesFilter:
unique(fsetdiff(merge(# Set of possible SPECIES values per study in the input set of animals:
unique(animalSPECIESallUniq[,.(STUDYID, SPECIES)]),
unique(foundAnimalsSPECIES[,.(STUDYID)]), by='STUDYID'),
unique(foundAnimalsSPECIES[,.(STUDYID, SPECIES)]))[,.(STUDYID)])),
by='STUDYID')
}
if (!InclStrainFilter) {
# No strainFilter has been specified
# The complete set of animals to return - add the uncertain animals
if (inclUncertain)
# Add the uncertain animals
foundAnimals<-rbindlist(list(foundAnimalsSPECIES,
unique(animalSPECIESall[!is.na(UNCERTAIN_MSG), .(STUDYID, USUBJID, SPECIES, UNCERTAIN_MSG)])),
use.names=TRUE, fill=TRUE)
else
foundAnimals<-foundAnimalsSPECIES
}
else {
##########################################################################################################################
# 2. strainFilter has been specified - limit set of animals extracted at SPECIES level to specified STRAIN(s)
# Get values of code list STRAIN from CDISC CT
ctSTRAIN<-getCTCodListValues("STRAIN")
# List of studyid values included in the list of animals extracted at SPECIES level
animalSPECIESStudies<-unique(foundAnimalsSPECIES[,.(STUDYID)])
# Extract all TS rows for parameter STRAIN, rename TSVAL to STRAIN_TS
# - remove duplicates
# - limit to the set of studies for input set of animals
tsSTRAINall<-merge(unique(TS[TSPARMCD == 'STRAIN', .(STUDYID,STRAIN_TS=toupper(trimws(TSVAL)))]), animalSPECIESStudies, by='STUDYID')
# Add studies with no TS parameter STRAIN from the set of studies for input set of animals
tsSTRAINall<-
rbindlist(list(tsSTRAINall,
fsetdiff(animalSPECIESStudies,
tsSTRAINall[,.(STUDYID)])[!is.na(STUDYID),.(STUDYID, STRAIN_TS=as.character(NA))]),
use.names=TRUE, fill=TRUE)
# Add variables with
# - count of number of distinct STRAIN per study
# - concatenation of all strain per study (for studies with one strain this is equal to STRAIN_TS)
tsSTRAINall[, `:=` (ALL_STRAIN = unique(STRAIN_TS), NUM_STRAIN_TS = .N), by = STUDYID]
tsSTRAINall[,`:=`(ALL_STRAIN_TS = c(.SD)), by = STUDYID, .SDcols='STRAIN_TS']
# JOin the set of studies/strain with DM to get all animal level STRAIN limited to the found set of animals at species level
# - the set will contain STUDYID/USUBJID duplicates for studies with multiple STRAIN registered in TS
animalSTRAINall<-
merge(merge(tsSTRAINall,
animalList[,.(STUDYID, USUBJID)],
by='STUDYID'),
merge(foundAnimalsSPECIES[,.(STUDYID,USUBJID)],
DM[,.(STUDYID, USUBJID, SETCD, STRAIN_DM=ifelse(STRAIN=="",as.character(NA),toupper(trimws(STRAIN))))],
by=c('STUDYID','USUBJID')),
by=c('STUDYID','USUBJID'))
# Join the list of studies/animals/strain with TX to get all set level STRAIN
# - add variable STRAIN with the first non-empty strain value from DM, TX or TS
# - add variable with count of unique USUBJID per study (there is expected to be one usubjid per studyid per TSPARMCD 'STRAIN' )
animalSTRAINall<-
merge(animalSTRAINall,
unique(TX[TXPARMCD=='STRAIN',.(STUDYID,SETCD,STRAIN_TX=toupper(trimws(TXVAL)))]),
by=c('STUDYID','SETCD'), all.x=TRUE )[,`:=` (STRAIN=fcoalesce(STRAIN_DM,STRAIN_TX,STRAIN_TS))]
animalSTRAINall[, `:=` (NUM_ANIMALS = .N), by = .(STUDYID, USUBJID)]
# Identify uncertain animals - add variable UNCERTAIN_MGS
animalSTRAINall[,`:=` (UNCERTAIN_MSG=mapply(identifyUncertainSTRAIN, STRAIN, STRAIN_TS, STRAIN_TX, STRAIN_DM, ALL_STRAIN_TS, NUM_STRAIN_TS, NUM_ANIMALS ))]
# Extract unique set of animals and related strain - exclude uncertain animals
animalSTRAINallUniq<-unique(animalSTRAINall[is.na(UNCERTAIN_MSG), .(STUDYID, USUBJID, STRAIN)])
# Extract animals match
foundAnimalsSTRAIN<-unique(animalSTRAINallUniq[STRAIN %in% strainFilter, .(STUDYID, USUBJID, STRAIN)])
if (exclusively) {
# Find studies with animals having other STRAIN than the requested
foundAnimalsSTRAIN<-
merge(foundAnimalsSTRAIN,
# Set of studies to keep:
fsetdiff(unique(foundAnimalsSTRAIN[,.(STUDYID)]),
# Set of studies (included in the found set of animals with matching STRAIN values) with possible
# STRAIN values not included in the strainFilter:
unique(fsetdiff(merge(# Set of possible STRAIN values per study in the input set of animals:
unique(animalSTRAINallUniq[,.(STUDYID, STRAIN)]),
unique(foundAnimalsSTRAIN[,.(STUDYID)]), by='STUDYID'),
unique(foundAnimalsSTRAIN[,.(STUDYID, STRAIN)]))[,.(STUDYID)])),
by='STUDYID')
}
# Merge set of animals with the set of animals extracted at SPECIES level to get the SPECIES values
foundAnimals<-merge(foundAnimalsSPECIES,
foundAnimalsSTRAIN,
by=c('STUDYID','USUBJID'))
if (inclUncertain) {
# Extract and merge the sets of uncertain animals identified at species and/or strain level
uncertainAnimals<-
merge(unique(animalSPECIESall[!is.na(UNCERTAIN_MSG), .(STUDYID, USUBJID, SPECIES, UNCERTAIN_MSG)]),
unique(animalSTRAINall[!is.na(UNCERTAIN_MSG), .(STUDYID, USUBJID, STRAIN, UNCERTAIN_MSG)]),
by=c('STUDYID','USUBJID'), all = TRUE)
# Merge the UNCERTAIN_MSG from each tables into one column
# - non-empty messages are separated by ' & '
# - The function name is included as first part of the combined texts
# - exclude the original UNCERTAIN_MSG columns after the merge
uncertainAnimals[,`:=` (UNCERTAIN_MSG=paste('FilterAnimalsSpeciesStrain: ',
ifelse(!is.na(UNCERTAIN_MSG.x) & !is.na(UNCERTAIN_MSG.y),
paste(UNCERTAIN_MSG.y, UNCERTAIN_MSG.x, sep=' & '),
fcoalesce(UNCERTAIN_MSG.x, UNCERTAIN_MSG.y))))][, `:=` (UNCERTAIN_MSG.x=NULL,UNCERTAIN_MSG.y=NULL)]
# Add the set of uncertain animals to the set of found animals
foundAnimals<-rbindlist(list(foundAnimals,uncertainAnimals), use.names=TRUE, fill=TRUE)
}
}
##################################################################################################################
# Handling of the final set of animals to return
# Merge the set of extracted animals with the input set of animals to keep
# any additional columns from the input table
foundAnimals<-merge(foundAnimals, animalList, by=c('STUDYID', 'USUBJID'))
if (inclUncertain)
if ("UNCERTAIN_MSG.y" %in% names(foundAnimals)) {
# An UNCERTAIN_MSG column is included in both input and found list of animals
# - merge the UNCERTAIN_MSG from each of the merged tables into one column
# - non-empty messages are separated by '|'
# - exclude the original UNCERTAIN_MSG columns after the merge
foundAnimals<-foundAnimals[,`:=` (UNCERTAIN_MSG=ifelse(!is.na(UNCERTAIN_MSG.x) & !is.na(UNCERTAIN_MSG.y),
paste(UNCERTAIN_MSG.y, UNCERTAIN_MSG.x, sep='|'),
fcoalesce(UNCERTAIN_MSG.x, UNCERTAIN_MSG.y)))][, `:=` (UNCERTAIN_MSG.x=NULL,UNCERTAIN_MSG.y=NULL)]
}
# Return list of found animals
return(foundAnimals)
}