The hardware and bandwidth for this mirror is donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.
Genular: An Integrated Knowledgebase Platform for Revealing Cell-Specific Genetic Signatures and Functional Insights ================ Ivan Tomic info@ivantomic.com
The genular package provides a comprehensive toolkit for
interacting with the Genular
Database API, enabling efficient retrieval, analysis, and
exploration of genomic and single-cell expression data directly within
the R environment.
You can install the released version of genular from CRAN with:
install.packages("genular")Or you can install genular directly from GitHub with use
of following commands:
install.packages("devtools")
devtools::install_github("atomiclaboratory/genular-database", subdir = 'libraries/genular-api/R-package')# Check if plyr is installed and install it if necessary
if (!requireNamespace("plyr", quietly = TRUE)) {
install.packages("plyr")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
install.packages("dplyr")
}
# Load the genular r-package
library(genular)
# Your personal API Key
# (request here or use your own installation: https://genular.atomic-lab.org/contact)
API_KEY <- "3147dda5fa023c9763d38bddb472dd28"# This example demonstrates how to suggest relevant cell types based on provided keywords. We use the `cells_suggest()` function to find cell types related to specific terms like "endothelial cell" and "T cell".
# Define the query values for cell type suggestions
queryValues <- c("endothelial cell", "T cell")
# Use the cells_suggest function to get relevant cell type matches
search_example_1 <- cells_suggest(queryValues,
options = list(api_key = API_KEY, timeout = 5000)
)
# Display the suggested cell types
# print(head(search_example_1))# In this example, we demonstrate how to search for specific cell information based on conditions applied to multiple cell IDs. The `cells_search()` function is used to retrieve data about cells that meet the specified criteria.
# Define query conditions for different cell IDs
queryValues <- list(
"CL0002350" = ">= 565",
"CL2000041" = ">= 323",
"CL0000129" = ">= 1821",
"CL0000453" = ">= 299"
)
# Define the fields to filter in the search results
fieldsFilter <- c(
"geneID",
"symbol",
"crossReference.enseGeneID",
"singleCellExpressions.effectSizes.i",
"singleCellExpressions.effectSizes.s",
"singleCellExpressions.effectSizes.c"
)
# Perform the cell search
search_example_2 <- cells_search(
queryValues,
fieldsFilter,
page = 1,
limit = 10,
searchType = "or",
orderBy = "geneID",
sortDirection = "asc",
responseType = "json",
organismType = list(9606),
options = list(api_key = "3147dda5fa023c9763d38bddb472dd28", timeout = 10000)
)
# Display the results
print(head(sapply(search_example_2$content$results, function(x) x$symbol)))
#> [1] "A2M" "ABR" "ACAA1" "ACADVL" "ASIC2" "ACP1"# This example demonstrates how to search for specific gene information using gene IDs. We use the `gene_search()` function to query gene-related details such as gene symbols and Ensembl gene IDs.
# Define the fields to search for and query gene IDs
queryFields <- list(c("geneID"))
queryValues <- c(1, 56, 70) # Specify the gene IDs to search for
searchType <- "or"
fieldsFilter <- c("geneID", "symbol", "crossReference.enseGeneID")
organismType = c(9606)
# Perform the gene search
search_example_3 <- gene_search(queryFields, queryValues, fieldsFilter, searchType = searchType,
organismType = organismType, page = 1, limit = 10,
options = list(api_key = API_KEY, timeout = 1000))
# Display the results
print(head(sapply(search_example_3$content$results, function(x) x$symbol)))
#> [1] "A1BG" "ACRV1" "ACTC1"# In this example, we demonstrate how to use keywords to suggest relevant biological pathways. We use the `pathways_suggest()` function to find pathways related to specific biological processes or activities, such as "apoptosis" and "signal transduction".
# Define the query values for pathway suggestion
queryValues <- c("apoptosis", "signal transduction")
# Use the pathways_suggest function to get relevant pathway matches
search_example_4 <- pathways_suggest(queryValues)
# Display the suggested pathways
print(head(sapply(search_example_4$results, function(x) x$term)))
#> [1] "Apoptosis" "Signal Transduction"
#> [3] "Signal transduction by L1" "Apoptosis induced DNA fragmentation"
#> [5] "Signaling by NODAL" "Signaling by FGFR in disease"# This example demonstrates how to perform feature engineering on gene expression data using external gene-related information. The goal is to convert raw gene expression data into pathway-level features for further analysis.
# Define gene expression data for a set of genes across multiple samples
# Demo data HIPC - Expression data with meta
remote_data <- "https://genular.ams3.cdn.digitaloceanspaces.com/database-export/hipc_expression_data_with_meta.csv"
input_data <- read.csv2(remote_data, header = TRUE, sep = ",", dec = ".", quote = "\"", fill = TRUE, comment.char = "")
library(stats)
# START DATA PREPROCESSING
# This is just an example of how to preprocess the data before using the genular API
# If you have raw gene expression data, probbably you will need to preprocess it before using the Genular API (eg. DESeq2)
metadata <- input_data[, 1:36] # Extract the first 36 columns as metadata
input_data_counts <- input_data[, 37:ncol(input_data)] # Extract gene expression data
scaled_data <- scale(input_data_counts)
# Perform PCA on the scaled data
pca_results <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
# Get the explained variance proportions for each principal component
explained_variance <- summary(pca_results)$importance[2,]
# Find the number of principal components explaining 90% of the variance
cumulative_variance <- cumsum(explained_variance)
num_components <- which(cumulative_variance >= 0.90)[1]
# Get the PCA loadings (rotation matrix)
loadings <- pca_results$rotation[, 1:num_components]
# Calculate the variance contribution of each variable by squaring the loadings
# and multiplying by the variance explained by the respective component
variance_contributions <- rowSums((loadings^2) * explained_variance[1:num_components])
# Rank variables by their contribution to the variance
ranked_variables <- sort(variance_contributions, decreasing = TRUE)
# Determine how many variables to retain (e.g., 90% of the cumulative variance contribution)
cumulative_var_contrib <- cumsum(ranked_variables) / sum(ranked_variables)
# Find the number of variables that cumulatively contribute to 0.75% of the variance explained by these variables
num_important_variables <- which(cumulative_var_contrib >= 0.75)[1]
# Select the top contributing variables
important_variables <- names(ranked_variables[1:num_important_variables])
# Subset input_data_counts to include only the important variables
input_data_filtered <- input_data_counts[, important_variables]
# END DATA PREPROCESSING
### Fetch gene-related data from genular database
# Using `fetch_all_gene_search_results()`, we query gene information for the given symbols in `input_data`
search_example_5 <- fetch_all_gene_search_results(
queryFields = list(c("symbol")), # Specify that the query is based on gene symbols
queryValues = colnames(input_data_filtered), # Gene symbols extracted from the column names of 'input_data'
fieldsFilter = c("geneID", "symbol", "crossReference.enseGeneID",
"mRNAExpressions.proteinAtlas.c",
"ontology.id", "ontology.term", "ontology.cat"), # Fields to be fetched
searchType = "or", # Search type specifying how to combine query fields
orderBy = "geneID", # Order the results by gene ID
sortDirection = "asc", # Sort direction: ascending
responseType = "json", # Format of the response
matchType = "exact", # Type of matching to use for the query
organismType = list(c(9606)), # Specify the organism type (e.g., Homo sapiens)
ontologyCategories = list(), # Specify ontology categories if needed
limit = 100, # Limit the number of results returned
options = list(api_key = API_KEY, timeout = 1000) # API key and timeout for the request
)
all_gene_symbols <- sapply(search_example_5, function(x) x$symbol)
gene_not_found <- setdiff(colnames(input_data_filtered), all_gene_symbols)
input_data_filtered <- input_data_filtered[, setdiff(colnames(input_data_filtered), gene_not_found)]
## Transform and restructure the fetched gene-related data
# The `extract_data()` function helps map various gene attributes to a more usable format
data_transposed <- extract_data(search_example_5, list(
"geneID" = "mappedGeneID", # Mapping gene IDs
"symbol" = "mappedSymbol", # Mapping gene symbols
"crossReference$enseGeneID" = "mappedEnseGeneID", # Mapping Ensembl gene IDs
"mRNAExpressions$proteinAtlas" = list(c("c" = "mappedC")), # Mapping mRNA expression
"ontology" = list(c("id" = "mappedId", "term" = "mappedTerm", "cat" = "mappedCat")) # Mapping ontology data
))
# Remove rows with missing ontology information
data_transposed <- data_transposed[!is.na(data_transposed$mappedId), ]
# View the first few rows of the transposed data
print(head(data_transposed[order(data_transposed$mappedId), ]))
## Convert gene expression data to pathway-level features / this can take a while
# Using the `convert_gene_expression_to_pathway_features()` function, we generate PathwayGeneScore and pathway-level features
final_data <- convert_gene_expression_to_pathway_features(input_data_filtered, data_transposed, T)
# (Optional) Remove NA columns from the final data
final_data <- final_data[, colSums(!is.na(final_data)) > 0]
# View the final data
print(head(final_data))
# Continue with downstream analysis using the pathway-level features# This example demonstrates how to retrieve genes associated with the "Adaptive Immune System" and process their corresponding cell marker scores for further analysis. We use `fetch_all_gene_search_results()` to obtain gene information, followed by data filtering and visualization steps.
# Fetch genes related to "Adaptive Immune System"
search_example_6 <- fetch_all_gene_search_results(
queryFields = list(c("ontology.term")),
queryValues = list(c("Adaptive Immune System")),
fieldsFilter = c("geneID", "symbol", "crossReference.enseGeneID",
"singleCellExpressions.effectSizes.i",
"singleCellExpressions.effectSizes.c",
"singleCellExpressions.effectSizes.s",
"singleCellExpressions.effectSizes.foldChange"),
searchType = "or",
orderBy = "geneID",
sortDirection = "asc",
responseType = "json",
matchType = "exact",
organismType = list(c(9606)),
limit = 100,
options = list(api_key = API_KEY, timeout = 1000)
)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
filter_effect_sizes_top_percentile <- function(gene, percentile = 90) {
# Extract all foldChange values for the current gene
fold_changes <- map_dbl(gene$singleCellExpressions$effectSizes, ~ .x$foldChange)
# Calculate the specified percentile (default is 90th percentile)
cutoff <- quantile(fold_changes, probs = percentile / 100, na.rm = TRUE)
# Filter effectSizes to retain only those with foldChange >= cutoff
filtered_effect_sizes <- gene$singleCellExpressions$effectSizes %>%
keep(~ .x$foldChange >= cutoff)
# Update the gene's effectSizes with the filtered list
gene$singleCellExpressions$effectSizes <- filtered_effect_sizes
return(gene)
}
filter_effect_sizes_by_lineage <- function(gene, lineage = "root") {
# Filter effectSizes to keep only those with cell_lineage == 'root'
filtered_effect_sizes <- gene$singleCellExpressions$effectSizes %>%
keep(~ .x$cell_lineage == lineage)
# Update the gene's effectSizes with the filtered list
gene$singleCellExpressions$effectSizes <- filtered_effect_sizes
return(gene)
}
search_example_6_filtered <- search_example_6 %>%
map(~ filter_effect_sizes_top_percentile(.x, percentile = 95))
search_example_6_filtered <- search_example_6_filtered %>%
map(~ filter_effect_sizes_by_lineage(.x, lineage = "root"))
# Remove genes with no remaining effectSizes
search_example_6_filtered <- search_example_6_filtered %>%
purrr::keep(~ length(.x$singleCellExpressions$effectSizes) > 0)
# Sort genes by the number of cells in descending order
num_cells_per_gene <- sapply(search_example_6_filtered, function(gene) {
length(gene$singleCellExpressions$effectSizes)
})
ordered_indices <- order(num_cells_per_gene, decreasing = TRUE)
search_example_6_filtered_sorted <- search_example_6_filtered[ordered_indices]
# Convert the sorted and filtered list to a tidy data frame for visualization
df_final <- search_example_6_filtered_sorted %>%
map_dfr(function(gene) {
gene_info <- tibble(
geneID = gene$geneID,
symbol = gene$symbol,
ensemblID = gene$crossReference$enseGeneID
)
effect_sizes <- gene$singleCellExpressions$effectSizes %>%
map_dfr(~ tibble(
cell_id = .x$cell_id,
context = .x$context,
markerScore = .x$markerScore,
scoreThreshold = .x$scoreThreshold,
foldChange = .x$foldChange,
cell_lineage = .x$cell_lineage,
cell_term = .x$cell_term
))
bind_cols(gene_info, effect_sizes)
})
# Visualize fold changes by cell type for a selected gene
selected_gene <- "HMGB1"
df_filtered <- df_final %>% filter(symbol == selected_gene)
ggplot(df_filtered, aes(x = reorder(cell_term, -foldChange), y = foldChange)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = paste("Fold Change by Cell Type for Gene", selected_gene),
x = "Cell Type",
y = "Fold Change"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold")
)These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.