#Notes:
#The following code runs in R and requires the packages "DGEobj.utils" (for CPM and TPM normalization) and "dplyr" (for filtering based on count level).
#A "gene_lengths" file is needed for TPM normalization, but not for CPM normalization. To create this file, we summed the lengths of every possible exon of a given gene, but other methods can be used to generate your own gene_lengths file.
#The raw count file must contain a "Genes" column and the gene_lengths file a "Name" column; these hold the gene IDs and are used as row names.

#CPM and TPM normalization
#Reads a genes x samples table of raw counts and a table of gene lengths,
#converts the counts to CPM and TPM with convertCounts() from DGEobj.utils,
#and writes each result to its own CSV file.
library(DGEobj.utils)

x <- read.delim("raw_counts.txt", row.names = "Genes", check.names = FALSE) #different file types can be used, just modify the code
lengths <- read.delim("gene_lengths.txt", row.names = "Name", check.names = FALSE)

#data.matrix() silently replaces text columns with integer codes, so stop here
#instead of normalizing numbers that no longer mean anything.
non_numeric <- names(x)[!vapply(x, is.numeric, logical(1))]
if (length(non_numeric) > 0) {
  stop("raw_counts.txt has non-numeric column(s): ",
       paste(non_numeric, collapse = ", "),
       call. = FALSE)
}
x <- data.matrix(x)
lengths <- as.matrix(lengths)

#Gene lengths must line up with the count rows gene by gene. The two files are
#read independently, so match them on gene ID rather than trusting that they
#list the same genes in the same order.
no_length <- setdiff(rownames(x), rownames(lengths))
if (length(no_length) > 0) {
  stop("gene_lengths.txt has no entry for ", length(no_length),
       " gene(s) found in raw_counts.txt, e.g. ",
       paste(head(no_length, 5), collapse = ", "),
       call. = FALSE)
}
lengths <- lengths[rownames(x), , drop = FALSE]

#CPM is not length-normalized, so no gene lengths are passed.
CPM <- convertCounts(x,
                     unit      = "CPM",
                     log       = FALSE,
                     normalize = "NONE")

TPM <- convertCounts(x,
                     unit       = "TPM",
                     geneLength = lengths,
                     log        = FALSE,
                     normalize  = "NONE")

#Row names (gene IDs) are written as the first, unnamed column of each file.
write.csv(CPM, file = "CPM_counts.csv")
write.csv(TPM, file = "TPM_counts.csv")

#Count filtering: this process removes any gene or retroelement that does not have a count of at least 1 in at least one sample
#The first column of TPM_counts.csv is treated as the gene ID; every other column is treated as one sample.
library(dplyr)

#check.names = FALSE keeps the sample names exactly as they appear in the file;
#by default read.csv rewrites names that are not syntactic R names (for example
#ones containing "-" or starting with a digit).
data <- read.csv("TPM_counts.csv", check.names = FALSE)

#With check.names off, a blank header on the gene-ID column stays blank. Give it
#the label "X" that read.csv assigns by default, so the output header is the
#same as before and the column has a usable name.
if (!nzchar(names(data)[1])) {
  names(data)[1] <- "X"
}

#drop = FALSE keeps the sample columns two-dimensional even when there is only
#one sample, which rowSums() requires. na.rm = TRUE stops a missing value in one
#sample from turning the whole row test into NA, which filter() would drop.
filtered_data <- data %>%
  filter(rowSums(.[, -1, drop = FALSE] >= 1, na.rm = TRUE) > 0)

write.csv(filtered_data, "filtered_TPM_counts.csv", row.names = FALSE)