Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 17 additions & 191 deletions R/annotateProteinInfoFromIndra.R
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
```r
#' Annotate Protein Information from Indra
#'
#' This function annotates a data frame with protein information from Indra.
#' @description This function enriches a data frame with additional protein information sourced from Indra. It appends details such as Uniprot IDs, HGNC IDs, HGNC names, and flags indicating if the protein is a transcription factor, kinase, or phosphatase.
#'
#' @param df output of \code{\link[MSstats]{groupComparison}} function's
#' comparisonResult table, which contains a list of proteins and their
#' corresponding p-values, logFCs, along with additional HGNC ID and HGNC
#' name columns
#' @param proteinIdType A character string specifying the type of protein ID.
#' It can be either "Uniprot", "Uniprot_Mnemonic", or "Hgnc_Name".
#' @return A data frame with the following columns:
#' @param df \code{data.frame}. The input data frame should be the output of the \code{\link[MSstats]{groupComparison}} function's comparisonResult table. It must contain a list of proteins with their respective p-values, log fold changes (logFCs), and additional columns for HGNC ID and HGNC name.
#' @param proteinIdType \code{character}. Specifies the type of protein ID used in the input data frame. Acceptable values are "Uniprot", "Uniprot_Mnemonic", or "Hgnc_Name".
#'
#' @return \code{data.frame}. The function returns a data frame with the following columns:
#' \describe{
#' \item{Protein}{Character. The original protein identifier.}
#' \item{UniprotID}{Character. The Uniprot ID of the protein.}
#' \item{HgncID}{Character. The HGNC ID of the protein.}
#' \item{HgncName}{Character. The HGNC name of the protein.}
#' \item{IsTranscriptionFactor}{Logical. Indicates if the protein is a transcription factor.}
#' \item{IsKinase}{Logical. Indicates if the protein is a kinase.}
#' \item{IsPhosphatase}{Logical. Indicates if the protein is a phosphatase.}
#' \item{Protein}{\code{character}. The original protein identifier from the input data frame.}
#' \item{UniprotID}{\code{character}. The Uniprot ID associated with the protein.}
#' \item{HgncID}{\code{character}. The HGNC ID associated with the protein.}
#' \item{HgncName}{\code{character}. The HGNC name associated with the protein.}
#' \item{IsTranscriptionFactor}{\code{logical}. Indicates whether the protein functions as a transcription factor.}
#' \item{IsKinase}{\code{logical}. Indicates whether the protein functions as a kinase.}
#' \item{IsPhosphatase}{\code{logical}. Indicates whether the protein functions as a phosphatase.}
#' }
#'
#' @examples
#' # Example usage of annotateProteinInfoFromIndra
#' df <- data.frame(Protein = c("CLH1_HUMAN"))
#' annotated_df <- annotateProteinInfoFromIndra(df, "Uniprot_Mnemonic")
#' head(annotated_df)
#'
#' @export
annotateProteinInfoFromIndra <- function(df, proteinIdType) {
.validateAnnotateProteinInfoFromIndraInput(df)
Expand All @@ -33,179 +34,4 @@ annotateProteinInfoFromIndra <- function(df, proteinIdType) {
df <- .populatePhophataseInfoInDataFrame(df)
return(df)
}

#' Validate Annotate Protein Info Input
#'
#' This function validates the input data frame for the annotateProteinInfoFromIndra function.
#'
#' @param df A data frame containing protein information.
#' @return None. Throws an error if validation fails.
.validateAnnotateProteinInfoFromIndraInput <- function(df) {
if (!"Protein" %in% colnames(df)) {
stop("Input dataframe must contain 'Protein' column.")
}
}

#' Populate Uniprot IDs in Data Frame
#'
#' This function populates the Uniprot IDs in the data frame based on the protein ID type.
#'
#' @param df A data frame containing protein information.
#' @param proteinIdType A character string specifying the type of protein ID.
#' It can be either "Uniprot" or "Uniprot_Mnemonic".
#' @return A data frame with populated Uniprot IDs.
.populateUniprotIdsInDataFrame <- function(df, proteinIdType) {
if ("GlobalProtein" %in% colnames(df)) {
protein_ids = unique(as.character(df$GlobalProtein))
} else {
df$Protein = as.character(df$Protein)
df$GlobalProtein = ifelse(grepl("_[A-Z][0-9]", df$Protein),
gsub("_[A-Z][0-9].*", "", df$Protein, perl = TRUE),
df$Protein)
protein_ids = unique(df$GlobalProtein)
}
df$UniprotId <- NA
if (proteinIdType == "Uniprot") {
df$UniprotId <- as.character(df$GlobalProtein)
}

if (proteinIdType == "Uniprot_Mnemonic") {
mnemonicProteins <- protein_ids
if (length(mnemonicProteins) > 0) {
uniprotMapping <- .callGetUniprotIdsFromUniprotMnemonicIdsApi(as.list(mnemonicProteins))
for (mnemonicId in names(uniprotMapping)) {
if (!is.null(uniprotMapping[[mnemonicId]])) {
df$UniprotId[df$GlobalProtein == mnemonicId] <- uniprotMapping[[mnemonicId]]
}
}
}
}

if (proteinIdType == "Hgnc_Name") {
df$UniprotId <- NA
}
return(df)
}

#' Populate HGNC IDs in Data Frame
#'
#' This function populates the HGNC IDs in the data frame based on the Uniprot IDs.
#'
#' @param df A data frame containing protein information.
#' @param proteinIdType A character string specifying the type of protein ID.
#' It can be either "Uniprot", "Uniprot_Mnemonic", or "Hgnc_Name".
#' @return A data frame with populated HGNC IDs.
.populateHgncIdsInDataFrame <- function(df, proteinIdType) {
df$HgncId <- NA
if (proteinIdType == "Uniprot" || proteinIdType == "Uniprot_Mnemonic") {
validMask <- !is.na(df$UniprotId)
validUniprots <- unique(df$UniprotId[validMask])
if (length(validUniprots) > 0) {
hgncMapping <- .callGetHgncIdsFromUniprotIdsApi(as.list(validUniprots))
for (uniprotId in names(hgncMapping)) {
if (!is.null(hgncMapping[[uniprotId]])) {
df$HgncId[df$UniprotId == uniprotId] <- hgncMapping[[uniprotId]]
}
}
}
} else {
hgncNames <- unique(df$GlobalProtein)
if (length(hgncNames) > 0) {
hgncMapping <- .callGetHgncIdsFromGildaApi(as.list(hgncNames))
for (hgncName in names(hgncMapping)) {
if (!is.null(hgncMapping[[hgncName]])) {
df$HgncId[df$GlobalProtein == hgncName] <- hgncMapping[[hgncName]]
}
}
}
}

return(df)
}

#' Populate HGNC Names in Data Frame
#'
#' This function populates the HGNC names in the data frame based on the HGNC IDs.
#'
#' @param df A data frame containing protein information.
#' @return A data frame with populated HGNC names.
.populateHgncNamesInDataFrame <- function(df) {
df$HgncName <- NA
validHgncMask <- !is.na(df$HgncId)
validHgncs <- unique(df$HgncId[validHgncMask])
if (length(validHgncs) > 0) {
nameMapping <- .callGetHgncNamesFromHgncIdsApi(as.list(validHgncs))
for (hgncId in names(nameMapping)) {
if (!is.null(nameMapping[[hgncId]])) {
df$HgncName[df$HgncId == hgncId] <- nameMapping[[hgncId]]
}
}
}
return(df)
}

#' Populate Transcription Factor Info in Data Frame
#'
#' This function populates the transcription factor information in the data frame based on the HGNC names.
#'
#' @param df A data frame containing protein information.
#' @return A data frame with populated transcription factor information.
.populateTranscriptionFactorInfoInDataFrame <- function(df) {
df$IsTranscriptionFactor <- NA
validNameMask <- !is.na(df$HgncName)
validNames <- unique(df$HgncName[validNameMask])
if (length(validNames) > 0) {
validNamesList <- as.list(validNames)
charMapping <- .callIsTranscriptionFactorApi(validNamesList)
for (hgncName in names(charMapping)) {
if (!is.null(charMapping[[hgncName]])) {
df$IsTranscriptionFactor[df$HgncName == hgncName] <- charMapping[[hgncName]]
}
}
}
return(df)
}

#' Populate Kinase Info in Data Frame
#'
#' This function populates the kinase information in the data frame based on the HGNC names.
#'
#' @param df A data frame containing protein information.
#' @return A data frame with populated kinase information.
.populateKinaseInfoInDataFrame <- function(df) {
df$IsKinase <- NA
validNameMask <- !is.na(df$HgncName)
validNames <- unique(df$HgncName[validNameMask])
if (length(validNames) > 0) {
validNamesList <- as.list(validNames)
charMapping <- .callIsKinaseApi(validNamesList)
for (hgncName in names(charMapping)) {
if (!is.null(charMapping[[hgncName]])) {
df$IsKinase[df$HgncName == hgncName] <- charMapping[[hgncName]]
}
}
}
return(df)
}

#' Populate Phosphatase Info in Data Frame
#'
#' This function populates the phosphatase information in the data frame based on the HGNC names.
#'
#' @param df A data frame containing protein information.
#' @return A data frame with populated phosphatase information.
.populatePhophataseInfoInDataFrame <- function(df) {
df$IsPhosphatase <- NA
validNameMask <- !is.na(df$HgncName)
validNames <- unique(df$HgncName[validNameMask])
if (length(validNames) > 0) {
validNamesList <- as.list(validNames)
charMapping <- .callIsPhosphataseApi(validNamesList)
for (hgncName in names(charMapping)) {
if (!is.null(charMapping[[hgncName]])) {
df$IsPhosphatase[df$HgncName == hgncName] <- charMapping[[hgncName]]
}
}
}
return(df)
}
```