# Notes:
# This script is written in R and requires the packages "DGEobj.utils"
# (for CPM and TPM normalization) and "dplyr" (for filtering by count level).
# A "gene_lengths" file is needed for TPM normalization, but not for CPM
# normalization. To create this file, we summed the lengths of each possible
# exon for a given gene, but other methods can be used to generate your own
# gene_lengths file.
#
# Inputs:  raw_counts.txt   (tab-delimited, gene IDs in a "Genes" column)
#          gene_lengths.txt (tab-delimited, gene IDs in a "Name" column)
# Outputs: CPM_counts.csv, TPM_counts.csv, filtered_TPM_counts.csv

library(DGEobj.utils)
library(dplyr)

# CPM and TPM normalization ----

# Different file types can be used; just modify the reading code.
# check.names = FALSE keeps the sample names exactly as they are in the file.
x <- read.delim("raw_counts.txt", row.names = "Genes", check.names = FALSE)
lengths <- read.delim(
  "gene_lengths.txt",
  row.names = "Name",
  check.names = FALSE
)

x <- data.matrix(x)
lengths <- as.matrix(lengths)

# Align the gene lengths to the count matrix by gene ID, so the result does
# not depend on the two files listing the genes in the same order (or on the
# lengths file containing only the genes that were counted).
length_rows <- match(rownames(x), rownames(lengths))
if (anyNA(length_rows)) {
  stop(
    "gene_lengths.txt has no entry for ", sum(is.na(length_rows)),
    " gene(s) in raw_counts.txt, e.g. ",
    paste(head(rownames(x)[is.na(length_rows)], 5L), collapse = ", "),
    call. = FALSE
  )
}
lengths <- lengths[length_rows, , drop = FALSE]

CPM <- convertCounts(
  x,
  unit = "CPM",
  geneLength = lengths,
  log = FALSE,
  normalize = "NONE"
)
TPM <- convertCounts(
  x,
  unit = "TPM",
  geneLength = lengths,
  log = FALSE,
  normalize = "NONE"
)

write.csv(CPM, file = "CPM_counts.csv")
write.csv(TPM, file = "TPM_counts.csv")

# Count filtering ----
# Keep only the genes or retroelements that have a value of at least 1 in at
# least one sample; rows where every sample is below 1 are removed.

# check.names = FALSE stops read.csv() from rewriting sample names that are
# not syntactic R names (e.g. names starting with a digit or containing "-"),
# so the filtered file uses the same sample names as TPM_counts.csv.
data <- read.csv("TPM_counts.csv", check.names = FALSE)

# write.csv() stored the gene IDs under a blank header. Name that column "X"
# (what read.csv() assigns to a blank header by default) so the data frame has
# no empty column name when it is passed to dplyr.
names(data)[1] <- "X"

# drop = FALSE keeps the sample columns two-dimensional even when there is
# only a single sample, which rowSums() requires.
filtered_data <- data %>%
  filter(rowSums(.[, -1, drop = FALSE] >= 1) > 0)

write.csv(filtered_data, "filtered_TPM_counts.csv", row.names = FALSE)