Dear all, Please be informed that, due to important technical maintenance, the GitLab server (git.list.lu) will not be available on Thursday, April 22nd, from 9 A.M. to 1 P.M. (Luxembourg time zone). Thank you for your understanding.

Commit 0efbd2ae authored by Fintan McGee's avatar Fintan McGee

merged different versions of the preprocessing files

parent 053bf118
...@@ -25,13 +25,14 @@ PrepareDataForInteractiveBinning <- function(dataset.name, ...@@ -25,13 +25,14 @@ PrepareDataForInteractiveBinning <- function(dataset.name,
data.consensus_length <- Biostrings::width(fasta) data.consensus_length <- Biostrings::width(fasta)
data.gc_content <- as.vector(Biostrings::letterFrequency(fasta, letters="CG", as.prob = TRUE)) data.gc_content <- as.vector(Biostrings::letterFrequency(fasta, letters="CG", as.prob = TRUE))
data.tnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 4, as.prob = TRUE)) data.tnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 4, as.prob = TRUE))
#data.pnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 5, as.prob = TRUE)) data.pnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 5, as.prob = TRUE))
data <- data.frame(row.names = NULL, data <- data.frame(row.names = NULL,
CONTIG = names(fasta), CONTIG = names(fasta),
GC_CONTENT = data.gc_content, GC_CONTENT = data.gc_content,
LENGTH = data.consensus_length, LENGTH = data.consensus_length,
data.tnf) #, data.pnf) data.tnf,
data.pnf)
print("* Reading average coverages...") print("* Reading average coverages...")
abundance <- read.csv(file.abundance) abundance <- read.csv(file.abundance)
...@@ -39,7 +40,7 @@ PrepareDataForInteractiveBinning <- function(dataset.name, ...@@ -39,7 +40,7 @@ PrepareDataForInteractiveBinning <- function(dataset.name,
# Some basic error checking # Some basic error checking
if(! ("CONTIG" %in% colnames(abundance))) { if(! ("CONTIG" %in% colnames(abundance))) {
warning("Required field 'CONTIG' not found in abundance file.") warning("Required field 'contig' not found in abundance file.")
return(FALSE) return(FALSE)
} }
...@@ -49,41 +50,42 @@ PrepareDataForInteractiveBinning <- function(dataset.name, ...@@ -49,41 +50,42 @@ PrepareDataForInteractiveBinning <- function(dataset.name,
warning("Not all contig identifiers from the fasta and the abundance file are equal.") warning("Not all contig identifiers from the fasta and the abundance file are equal.")
return(FALSE) return(FALSE)
} }
data <- plyr::join(data, abundance, by="CONTIG") data <- merge(data, abundance, by="CONTIG")
cluster.results <- NA cluster.results <- NA
if (!is.null(file.clusterings)) { if (!is.null(file.clusterings)) {
print("* Reading clustering results...")
cluster.results <- read.csv(file.clusterings) cluster.results <- read.csv(file.clusterings)
names(cluster.results) <- toupper(names(cluster.results)) names(cluster.results) <- toupper(names(cluster.results))
stopifnot("CONTIG" %in% names(cluster.results)) stopifnot("CONTIG" %in% names(cluster.results))
data <- plyr::join(data, cluster.results, by="CONTIG") data <- merge(data, cluster.results, by="CONTIG")
} }
print("* Reading essential single copy genes...") # We're currently not using the fasta in the prototype so let's not add it ot
# the dataset for now.
#assign(paste(dataset.name, "fasta", sep="."), fasta)
assign(paste(dataset.name, "escg", sep="."), ExtractESCG(file.escg)) assign(paste(dataset.name, "escg", sep="."), ExtractESCG(file.escg))
assign(dataset.name, data) assign(dataset.name, data)
print("* COnstructing schema...") print("* COnstructing schema...")
nnucleotides <- dim(data.tnf)[2] nnucleotides <- dim(data.tnf)[2]
#npentanucleotides <- dim(data.pnf)[2] npentanucleotides <- dim(data.pnf)[2]
nsamples <- ncol(get(dataset.name)) - 3 - nnucleotides #- npentanucleotides # 3: contig, gc, length nsamples <- ncol(get(dataset.name)) - 3 - nnucleotides - npentanucleotides # 3: contig, gc, length
# Now create the schema # Now create the schema
type <- c("character", # contig Id type <- c("character", # contig Id
"numeric", # GC Contig Properties "numeric", # GC Contig Properties
"integer", # Consensus_length Contig Properties "integer", # Consensus_length Contig Properties
rep("numeric", nnucleotides), # Tetra nucleotide frequencies rep("numeric", nnucleotides), # Tetra nucleotide frequencies
#rep("numeric", npentanucleotides), # Penta nucleotide frequencies rep("numeric", npentanucleotides), # Penta nucleotide frequencies
rep("integer", nsamples)) # Sample Abundances rep("integer", nsamples)) # Sample Abundances
group <- c("Id", group <- c("Id",
rep("Contig properties", 2), rep("Contig properties", 2),
rep("Tetra nucleotide frequencies", nnucleotides), rep("Tetra nucleotide frequencies", nnucleotides),
#rep("Penta nucleotide frequencies", npentanucleotides), rep("Penta nucleotide frequencies", npentanucleotides),
rep("Sample abundances", nsamples)) rep("Sample abundances", nsamples))
group_type <- c("Id", group_type <- c("Id",
rep("Characteristics", 2), rep("Characteristics", 2),
rep("Frequencies", nnucleotides), rep("Frequencies", nnucleotides),
#rep("Frequencies", npentanucleotides), rep("Frequencies", npentanucleotides),
rep("TimeSeries", nsamples)) rep("TimeSeries", nsamples))
schema <- data.frame(name = names(get(dataset.name)), schema <- data.frame(name = names(get(dataset.name)),
......
...@@ -7,16 +7,26 @@ source("R.preprocessing/SymmetrizedSignatures.R") ...@@ -7,16 +7,26 @@ source("R.preprocessing/SymmetrizedSignatures.R")
source("R.preprocessing/ExtractESCG.R") source("R.preprocessing/ExtractESCG.R")
source("R.preprocessing/PrepareDataForInteractiveBinning.R") source("R.preprocessing/PrepareDataForInteractiveBinning.R")
# Prepare the Wrighton data set # Prepare the CSTR data set
PrepareDataForInteractiveBinning( PrepareDataForInteractiveBinning(
dataset.name = "wrighton", dataset.name = "cstr",
file.fasta = "data//wrighton_assembly.fasta.gz", file.fasta = "data//CSTRmetagenomics.fasta",
file.abundance = "data//wrighton_avg_cov.csv", file.abundance = "data//cstr_coverage.csv",
file.escg = "data//wrighton_escg.csv", file.escg = "data/cstr_escg.csv",
file.clusterings = "data//wrighton_clusterings.csv", file.clusterings = "data//cstr_clusterings.csv",
dir.result = "R.ICoVeR//data" dir.result = "R.ICoVeR//data"
) )
# Prepare the Wrighton data set
# PrepareDataForInteractiveBinning(
# dataset.name = "wrighton",
# file.fasta = "data//wrighton_assembled.fasta",
# file.abundance = "data//wrighton_avg_coverage.csv",
# file.escg = "data//wrighton_escg.csv",
# file.clusterings = "data//wrighton_clusterings.csv",
# dir.result = "R.ICoVeR//data"
# )
# Install the ICoVeR package after data generation. # Install the ICoVeR package after data generation.
# NOTE: Before running install_local, make sure that R.ICoVeR/R/sqlite.R # NOTE: Before running install_local, make sure that R.ICoVeR/R/sqlite.R
# is configured properly. The variable p.db.dataset must be assigned the # is configured properly. The variable p.db.dataset must be assigned the
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment