From 0efbd2ae8c7cba4fc09c7c0084989e6eb6c784db Mon Sep 17 00:00:00 2001 From: Fintan McGee Date: Wed, 5 Oct 2016 15:09:47 +0200 Subject: [PATCH] merged different versions of the preprocessing files --- .../PrepareDataForInteractiveBinning.R | 26 ++++++++++--------- R.preprocessing/preprocessing.R | 24 ++++++++++++----- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/R.preprocessing/PrepareDataForInteractiveBinning.R b/R.preprocessing/PrepareDataForInteractiveBinning.R index 843f33e..7e29b66 100644 --- a/R.preprocessing/PrepareDataForInteractiveBinning.R +++ b/R.preprocessing/PrepareDataForInteractiveBinning.R @@ -25,13 +25,14 @@ PrepareDataForInteractiveBinning <- function(dataset.name, data.consensus_length <- Biostrings::width(fasta) data.gc_content <- as.vector(Biostrings::letterFrequency(fasta, letters="CG", as.prob = TRUE)) data.tnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 4, as.prob = TRUE)) - #data.pnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 5, as.prob = TRUE)) + data.pnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 5, as.prob = TRUE)) data <- data.frame(row.names = NULL, CONTIG = names(fasta), GC_CONTENT = data.gc_content, LENGTH = data.consensus_length, - data.tnf) #, data.pnf) + data.tnf, + data.pnf) print("* Reading average coverages...") abundance <- read.csv(file.abundance) @@ -39,7 +40,7 @@ PrepareDataForInteractiveBinning <- function(dataset.name, # Some basic error checking if(! 
("CONTIG" %in% colnames(abundance))) { - warning("Required field 'CONTIG' not found in abundance file.") + warning("Required field 'CONTIG' not found in abundance file.") return(FALSE) } @@ -49,41 +50,42 @@ PrepareDataForInteractiveBinning <- function(dataset.name, warning("Not all contig identifiers from the fasta and the abundance file are equal.") return(FALSE) } - data <- plyr::join(data, abundance, by="CONTIG") + data <- merge(data, abundance, by="CONTIG") cluster.results <- NA if (!is.null(file.clusterings)) { - print("* Reading clustering results...") cluster.results <- read.csv(file.clusterings) names(cluster.results) <- toupper(names(cluster.results)) stopifnot("CONTIG" %in% names(cluster.results)) - data <- plyr::join(data, cluster.results, by="CONTIG") + data <- merge(data, cluster.results, by="CONTIG") } - print("* Reading essential single copy genes...") + # We're currently not using the fasta in the prototype so let's not add it to + # the dataset for now. + #assign(paste(dataset.name, "fasta", sep="."), fasta) assign(paste(dataset.name, "escg", sep="."), ExtractESCG(file.escg)) assign(dataset.name, data) print("* COnstructing schema...") nnucleotides <- dim(data.tnf)[2] - #npentanucleotides <- dim(data.pnf)[2] - nsamples <- ncol(get(dataset.name)) - 3 - nnucleotides #- npentanucleotides # 3: contig, gc, length + npentanucleotides <- dim(data.pnf)[2] + nsamples <- ncol(get(dataset.name)) - 3 - nnucleotides - npentanucleotides # 3: contig, gc, length # Now create the schema type <- c("character", # contig Id "numeric", # GC Contig Properties "integer", # Consensus_length Contig Properties rep("numeric", nnucleotides), # Tetra nucleotide frequencies - #rep("numeric", npentanucleotides), # Penta nucleotide frequencies + rep("numeric", npentanucleotides), # Penta nucleotide frequencies rep("integer", nsamples)) # Sample Abundances group <- c("Id", rep("Contig properties", 2), rep("Tetra nucleotide frequencies", nnucleotides), - #rep("Penta nucleotide 
frequencies", npentanucleotides), + rep("Penta nucleotide frequencies", npentanucleotides), rep("Sample abundances", nsamples)) group_type <- c("Id", rep("Characteristics", 2), rep("Frequencies", nnucleotides), - #rep("Frequencies", npentanucleotides), + rep("Frequencies", npentanucleotides), rep("TimeSeries", nsamples)) schema <- data.frame(name = names(get(dataset.name)), diff --git a/R.preprocessing/preprocessing.R b/R.preprocessing/preprocessing.R index 76e314d..f124356 100644 --- a/R.preprocessing/preprocessing.R +++ b/R.preprocessing/preprocessing.R @@ -7,16 +7,26 @@ source("R.preprocessing/SymmetrizedSignatures.R") source("R.preprocessing/ExtractESCG.R") source("R.preprocessing/PrepareDataForInteractiveBinning.R") -# Prepare the Wrighton data set +# Prepare the CSTR data set PrepareDataForInteractiveBinning( - dataset.name = "wrighton", - file.fasta = "data//wrighton_assembly.fasta.gz", - file.abundance = "data//wrighton_avg_cov.csv", - file.escg = "data//wrighton_escg.csv", - file.clusterings = "data//wrighton_clusterings.csv", - dir.result = "R.ICoVeR//data" + dataset.name = "cstr", + file.fasta = "data//CSTRmetagenomics.fasta", + file.abundance = "data//cstr_coverage.csv", + file.escg = "data/cstr_escg.csv", + file.clusterings = "data//cstr_clusterings.csv", + dir.result = "R.ICoVeR//data" ) +# Prepare the Wrighton data set +# PrepareDataForInteractiveBinning( +# dataset.name = "wrighton", +# file.fasta = "data//wrighton_assembled.fasta", +# file.abundance = "data//wrighton_avg_coverage.csv", +# file.escg = "data//wrighton_escg.csv", +# file.clusterings = "data//wrighton_clusterings.csv", +# dir.result = "R.ICoVeR//data" +# ) + # Install the ICoVeR package after data generation. # NOTE: Before running install_local, make sure that R.ICoVeR/R/sqlite.R # is configured properly. The variable p.db.dataset must be assigned the -- GitLab