Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 50 additions & 48 deletions R/dataProcess.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ dataProcess = function(
"== Start the summarization per subplot...")
getOption("MSstatsMsg")("INFO",
" == Start the summarization per subplot...")

processed = getProcessed(input)
input = MSstatsPrepareForSummarization(input, summaryMethod, MBimpute, censoredInt,
remove_uninformative_feature_outlier)
Expand Down Expand Up @@ -210,7 +210,12 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
remove50missing, equal_variance, numberOfCores = 1,
aft_iterations = 90) {
if (numberOfCores > 1) {
protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
is_labeled_reference = "ref" %in% colnames(input) && any(input$ref, na.rm = TRUE)
if (is_labeled_reference) {
protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
} else {
protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN, input$LABEL))
}
num_proteins = length(protein_indices)
function_environment = environment()
cl = parallel::makeCluster(numberOfCores)
Expand Down Expand Up @@ -253,9 +258,9 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
parallel::stopCluster(cl)
return(summarized_results)
} else {
return(MSstatsSummarizeWithSingleCore(input, method, impute,
censored_symbol,
remove50missing,
return(MSstatsSummarizeWithSingleCore(input, method, impute,
censored_symbol,
remove50missing,
equal_variance,
aft_iterations))
}
Expand Down Expand Up @@ -294,7 +299,12 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol
remove50missing, equal_variance, aft_iterations = 90) {


protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
is_labeled_reference = "ref" %in% colnames(input) && any(input$ref, na.rm = TRUE)
if (is_labeled_reference) {
protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
} else {
protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN, input$LABEL))
}
num_proteins = length(protein_indices)
summarized_results = vector("list", num_proteins)
if (method == "TMP") {
Expand Down Expand Up @@ -366,7 +376,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,

cols = intersect(
colnames(single_protein),
c("newABUNDANCE", "cen", "RUN", "FEATURE", "ref")
c("newABUNDANCE", "cen", "RUN", "FEATURE", "ref_covariate")
)

single_protein = single_protein[
Expand All @@ -383,33 +393,25 @@ MSstatsSummarizeSingleLinear = function(single_protein,

if (impute & any(single_protein[["censored"]])) {
survival_fit = .fitSurvival(
single_protein[LABEL == "L", cols, with = FALSE],
single_protein[, cols, with = FALSE],
aft_iterations
)
sigma2 = survival_fit$scale^2

single_protein[, c("predicted", "imputation_var") := {
pred = predict(survival_fit, newdata = .SD, se.fit = TRUE)
list(pred$fit, pred$se.fit^2 + sigma2)
}]

single_protein[, predicted := ifelse(
censored & (LABEL == "L"),
predicted,
NA
)]
single_protein[, newABUNDANCE := ifelse(
censored & LABEL == "L",
predicted,
newABUNDANCE
)]

survival = single_protein[, c(cols, "predicted"), with = FALSE]

single_protein[, predicted := ifelse(censored, predicted, NA)]
single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]

survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
} else {
survival = single_protein[, cols, with = FALSE]
survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE]
survival[, predicted := NA]
}

if (all(!is.na(single_protein$ANOMALYSCORES))) {
single_protein[, weights :=
anomaly_weights_z_vec(ANOMALYSCORES),
Expand All @@ -432,14 +434,12 @@ MSstatsSummarizeSingleLinear = function(single_protein,
is_single_feature = .checkSingleFeature(single_protein)

if (is_single_feature) {
result = single_protein[LABEL == "L",
.(LogIntensities = mean(newABUNDANCE)),
by = RUN]
result = single_protein[, .(LogIntensities = mean(newABUNDANCE)), by = RUN]
result[, Protein := unique(single_protein$PROTEIN)]
result[, LABEL := unique(single_protein$LABEL)]
result[, Variance := NA_real_]
setcolorder(result, c("Protein", "RUN", "LogIntensities",
"Variance"))

setcolorder(result, c("Protein", "RUN", "LogIntensities", "Variance"))

return(list(result, survival))
} else {
counts = xtabs(
Expand All @@ -465,13 +465,13 @@ MSstatsSummarizeSingleLinear = function(single_protein,
cf = summary(fit)$coefficients[, 1]
cov_mat = vcov(fit)

result = unique(single_protein[, .(Protein = PROTEIN,
RUN = RUN)])
result = unique(single_protein[, .(Protein = PROTEIN, RUN = RUN)])
extracted_values = get_linear_summary(single_protein, cf,
counts, label, cov_mat)
result = cbind(result, extracted_values)
result[, LABEL := unique(single_protein$LABEL)]
}

return(list(result, survival))
}
}
Expand Down Expand Up @@ -513,7 +513,7 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL
predicted = censored = NULL
cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN",
"FEATURE", "ref"))
"FEATURE", "ref_covariate"))
single_protein = single_protein[(n_obs > 1 & !is.na(n_obs)) &
(n_obs_run > 0 & !is.na(n_obs_run))]
if (nrow(single_protein) == 0) {
Expand All @@ -522,44 +522,46 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
single_protein[, RUN := factor(RUN)]
single_protein[, FEATURE := factor(FEATURE)]
if (impute & any(single_protein[["censored"]])) {

# Flag to track convergence warning
converged = TRUE

# Try to fit survival model and catch convergence warnings
survival_fit = withCallingHandlers({
.fitSurvival(single_protein[LABEL == "L", cols, with = FALSE],
aft_iterations)
.fitSurvival(single_protein[, cols, with = FALSE], aft_iterations)
}, warning = function(w) {
if (grepl("converge", conditionMessage(w), ignore.case = TRUE)) {
message("Convergence warning caught: ", conditionMessage(w))
converged <<- FALSE
}
})

if (converged) {
single_protein[, predicted := predict(survival_fit, newdata = .SD)]
} else {
single_protein[, predicted := NA_real_]
}

single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)]
single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L",
predicted, newABUNDANCE)]
survival = single_protein[, c(cols, "predicted"), with = FALSE]

single_protein[, predicted := ifelse(censored, predicted, NA)]
single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
} else {
survival = single_protein[, cols, with = FALSE]
survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE]
survival[, predicted := NA]
}

single_protein = .isSummarizable(single_protein, remove50missing)
if (is.null(single_protein)) {
return(list(NULL, NULL))
} else {
single_protein = single_protein[!is.na(newABUNDANCE), ]
is_labeled = nlevels(single_protein$LABEL) > 1
result = .runTukey(single_protein, is_labeled, censored_symbol,
is_labeled_reference = "ref" %in% colnames(single_protein) &&
any(single_protein$ref, na.rm = TRUE)
result = .runTukey(single_protein, is_labeled_reference, censored_symbol,
remove50missing)
if (!is.null(result) && !is.element("LABEL", colnames(result))) {
result[, LABEL := "L"]
}
}
list(result, survival)
}
35 changes: 22 additions & 13 deletions R/dataProcessPlots.R
Original file line number Diff line number Diff line change
Expand Up @@ -261,15 +261,15 @@ dataProcessPlots = function(
labels = seq(1, length(unique(RUN))))]
summarized[, RUN := as.numeric(RUN)]

## Meena :due to GROUP=0 for labeled.. extra care required.
has_heavy_prof = any(processed$LABEL == "H")
heavy_has_real_group_prof = has_heavy_prof && !all(processed[LABEL == "H", GROUP] %in% c("0", NA))
tempGroupName = unique(processed[, c("GROUP", "RUN")])
if (length(unique(processed$LABEL)) == 2) {
if (length(unique(processed$LABEL)) == 2 && !heavy_has_real_group_prof) {
tempGroupName = tempGroupName[GROUP != '0']
}
tempGroupName = tempGroupName[order(RUN), ] ## Meena : should we order by GROUP or RUN? I guess by RUn, because x-axis is by RUN
}
tempGroupName = tempGroupName[order(RUN), ]
level.group = as.character(unique(tempGroupName$GROUP))
tempGroupName$GROUP = factor(tempGroupName$GROUP,
levels = level.group) ## Meena : factor GROUP again, due to 1, 10, 2, ... if you have better way, please change
tempGroupName$GROUP = factor(tempGroupName$GROUP, levels = level.group)

groupAxis = as.numeric(xtabs(~GROUP, tempGroupName))
cumGroupAxis = cumsum(groupAxis)
Expand All @@ -292,7 +292,11 @@ dataProcessPlots = function(


if (length(unique(processed$LABEL)) == 2) {
processed[, LABEL := factor(LABEL, labels = c("Reference", "Endogenous"))]
if (heavy_has_real_group_prof) {
processed[, LABEL := factor(LABEL, labels = c("Heavy", "Light"))]
} else {
processed[, LABEL := factor(LABEL, labels = c("Reference", "Endogenous"))]
}
} else {
if (unique(processed$LABEL) == "L") {
processed[, LABEL := factor(LABEL, labels = c("Endogenous"))]
Expand Down Expand Up @@ -442,8 +446,14 @@ dataProcessPlots = function(
processed[, RUN := factor(RUN, levels = unique(processed$RUN),
labels = seq(1, data.table::uniqueN(processed$RUN)))]

has_heavy = any(processed$LABEL == "H")
heavy_has_real_group = has_heavy && !all(processed[LABEL == "H", GROUP] %in% c("0", NA))
if (length(unique(processed$LABEL)) == 2) {
processed[, LABEL := factor(LABEL, labels = c("Reference", "Endogenous"))]
if (heavy_has_real_group) {
processed[, LABEL := factor(LABEL, labels = c("Heavy", "Light"))]
} else {
processed[, LABEL := factor(LABEL, labels = c("Reference", "Endogenous"))]
}
label.color = c("darkseagreen1", "lightblue")
} else {
if (unique(processed$LABEL) == "L") {
Expand All @@ -454,14 +464,13 @@ dataProcessPlots = function(
label.color = c("darkseagreen1")
}
}

processed = processed[order(LABEL, GROUP, SUBJECT)]

## Meena :due to GROUP=0 for labeled.. extra care required.

tempGroupName = unique(processed[, list(GROUP, RUN)])
if (length(unique(processed$LABEL)) == 2) {
if (length(unique(processed$LABEL)) == 2 && !heavy_has_real_group) {
tempGroupName = tempGroupName[GROUP != '0']
}
}
tempGroupName = tempGroupName[order(RUN), ] ## Meena : should we order by GROUP or RUN? I guess by RUn, because x-axis is by RUN
level.group = as.character(unique(tempGroupName$GROUP))
tempGroupName$GROUP = factor(tempGroupName$GROUP,
Expand Down
46 changes: 22 additions & 24 deletions R/utils_censored.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,49 +31,47 @@ MSstatsHandleMissing = function(input, summary_method, impute,

if (impute & !is.null(missing_symbol)) {
input$censored = FALSE
use_for_analysis = if ("ref" %in% colnames(input)) !input$ref else rep(TRUE, nrow(input))
## if intensity = 1, but abundance > cutoff after normalization, it also should be censored.
if (!is.null(censored_cutoff)) {
quantiles = input[!is.na(INTENSITY) & INTENSITY > 1 & LABEL == "L",
quantile(ABUNDANCE,
prob = c(0.01, 0.25, 0.5, 0.75,
censored_cutoff),
quantiles = input[use_for_analysis & !is.na(INTENSITY) & INTENSITY > 1,
quantile(ABUNDANCE,
prob = c(0.01, 0.25, 0.5, 0.75,
censored_cutoff),
na.rm = TRUE)]
iqr = quantiles[4] - quantiles[2]
multiplier = (quantiles[5] - quantiles[4]) / iqr
cutoff_lower = (quantiles[2] - multiplier * iqr)
input$censored = !is.na(input$INTENSITY) &
input$LABEL == "L" &
cutoff_lower = (quantiles[2] - multiplier * iqr)
input$censored = use_for_analysis & !is.na(input$INTENSITY) &
input$ABUNDANCE < cutoff_lower
if (cutoff_lower <= 0 & !is.null(missing_symbol) & missing_symbol == "0") {
zero_one_filter = !is.na(input$ABUNDANCE) & input$ABUNDANCE <= 0
zero_one_filter = use_for_analysis & !is.na(input$ABUNDANCE) & input$ABUNDANCE <= 0
input$censored = ifelse(zero_one_filter, TRUE, input$censored)
}
if (!is.null(missing_symbol) & missing_symbol == "NA") {
input$censored = ifelse(is.na(input$INTENSITY), TRUE,
input$censored = ifelse(use_for_analysis & is.na(input$INTENSITY), TRUE,
input$censored)
}
msg = paste('** Log2 intensities under cutoff =',
format(cutoff_lower, digits = 5),

msg = paste('** Log2 intensities under cutoff =',
format(cutoff_lower, digits = 5),
' were considered as censored missing values.')
msg_2 = paste("** Log2 intensities =", missing_symbol, "were considered as censored missing values.")

getOption("MSstatsMsg")("INFO", msg)
getOption("MSstatsMsg")("INFO", msg_2)

getOption("MSstatsLog")("INFO", msg)
getOption("MSstatsLog")("INFO", msg_2)

} else {
if (missing_symbol == '0') {
input$censored = input$LABEL == "L" &
!is.na(input$INTENSITY) &
input$censored = use_for_analysis & !is.na(input$INTENSITY) &
(input$INTENSITY == 1 | input$ABUNDANCE <= 0)
} else if (missing_symbol == 'NA') {
input$censored = input$LABEL == "L" & is.na(input$ABUNDANCE)
input$censored = use_for_analysis & is.na(input$ABUNDANCE)
}
}
input[, censored := ifelse(LABEL == "H", FALSE, censored)]
} else {
input$censored = FALSE
}
Expand Down Expand Up @@ -131,13 +129,13 @@ MSstatsHandleMissing = function(input, summary_method, impute,
if (impute) {
if (!is.null(censored_symbol)) {
if (censored_symbol == "0") {
nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE) & input$newABUNDANCE != 0
nonmissing_filter = !is.na(input$newABUNDANCE) & input$newABUNDANCE != 0
} else if (censored_symbol == "NA") {
nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE)
}
}
nonmissing_filter = !is.na(input$newABUNDANCE)
}
}
} else {
nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE) & input$newABUNDANCE != 0
nonmissing_filter = !is.na(input$newABUNDANCE) & input$newABUNDANCE != 0
}
nonmissing_filter
}
21 changes: 13 additions & 8 deletions R/utils_checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,17 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
}
input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_")
input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_")
input$ISOTOPELABELTYPE = factor(input$ISOTOPELABELTYPE)
if (data.table::uniqueN(input$ISOTOPELABELTYPE) == 2) {
levels(input$ISOTOPELABELTYPE) = c("H", "L")
} else {
levels(input$ISOTOPELABELTYPE) = "L"
}
label_map <- c(
"H" = "H",
"L" = "L",
"heavy" = "H",
"light" = "L",
"NA" = "NA"
)
input$ISOTOPELABELTYPE <- factor(
label_map[as.character(input$ISOTOPELABELTYPE)],
levels = c("H", "L", "NA")
)
input
}

Expand Down Expand Up @@ -287,8 +292,8 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
skip_absent = TRUE)

input[, FEATURE := paste(PEPTIDE, TRANSITION, sep = "_")]
input[, GROUP := ifelse(LABEL == "L", GROUP_ORIGINAL, "0")]
input[, SUBJECT := ifelse(LABEL == "L", SUBJECT_ORIGINAL, "0")]
input[, GROUP := GROUP_ORIGINAL]
input[, SUBJECT := SUBJECT_ORIGINAL]

cols = c("PROTEIN", "PEPTIDE", "TRANSITION", "FEATURE", "LABEL",
"GROUP_ORIGINAL", "SUBJECT_ORIGINAL", "RUN", "GROUP",
Expand Down
Loading
Loading