diff --git a/R.preprocessing/PrepareDataForInteractiveBinning.R b/R.preprocessing/PrepareDataForInteractiveBinning.R index 843f33e3e1432c82276b2fed6d8daf53db0e6a65..7e29b66b232c1f7d1a43110c9e2be700946c70fc 100644 --- a/R.preprocessing/PrepareDataForInteractiveBinning.R +++ b/R.preprocessing/PrepareDataForInteractiveBinning.R @@ -25,13 +25,14 @@ PrepareDataForInteractiveBinning <- function(dataset.name, data.consensus_length <- Biostrings::width(fasta) data.gc_content <- as.vector(Biostrings::letterFrequency(fasta, letters="CG", as.prob = TRUE)) data.tnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 4, as.prob = TRUE)) - #data.pnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 5, as.prob = TRUE)) + data.pnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 5, as.prob = TRUE)) data <- data.frame(row.names = NULL, CONTIG = names(fasta), GC_CONTENT = data.gc_content, LENGTH = data.consensus_length, - data.tnf) #, data.pnf) + data.tnf, + data.pnf) print("* Reading average coverages...") abundance <- read.csv(file.abundance) @@ -39,7 +40,7 @@ PrepareDataForInteractiveBinning <- function(dataset.name, # Some basic error checking if(! ("CONTIG" %in% colnames(abundance))) { - warning("Required field 'CONTIG' not found in abundance file.") + warning("Required field 'contig' not found in abundance file.") return(FALSE) } @@ -49,41 +50,42 @@ PrepareDataForInteractiveBinning <- function(dataset.name, warning("Not all contig identifiers from the fasta and the abundance file are equal.") return(FALSE) } - data <- plyr::join(data, abundance, by="CONTIG") + data <- merge(data, abundance, by="CONTIG") cluster.results <- NA if (!is.null(file.clusterings)) { - print("* Reading clustering results...") cluster.results <- read.csv(file.clusterings) names(cluster.results) <- toupper(names(cluster.results)) stopifnot("CONTIG" %in% names(cluster.results)) - data <- plyr::join(data, cluster.results, by="CONTIG") + data <- merge(data, cluster.results, by="CONTIG") } - print("* Reading essential single copy genes...") + # We're currently not using the fasta in the prototype so let's not add it ot + # the dataset for now. + #assign(paste(dataset.name, "fasta", sep="."), fasta) assign(paste(dataset.name, "escg", sep="."), ExtractESCG(file.escg)) assign(dataset.name, data) print("* COnstructing schema...") nnucleotides <- dim(data.tnf)[2] - #npentanucleotides <- dim(data.pnf)[2] - nsamples <- ncol(get(dataset.name)) - 3 - nnucleotides #- npentanucleotides # 3: contig, gc, length + npentanucleotides <- dim(data.pnf)[2] + nsamples <- ncol(get(dataset.name)) - 3 - nnucleotides - npentanucleotides # 3: contig, gc, length # Now create the schema type <- c("character", # contig Id "numeric", # GC Contig Properties "integer", # Consensus_length Contig Properties rep("numeric", nnucleotides), # Tetra nucleotide frequencies - #rep("numeric", npentanucleotides), # Penta nucleotide frequencies + rep("numeric", npentanucleotides), # Penta nucleotide frequencies rep("integer", nsamples)) # Sample Abundances group <- c("Id", rep("Contig properties", 2), rep("Tetra nucleotide frequencies", nnucleotides), - #rep("Penta nucleotide frequencies", npentanucleotides), + rep("Penta nucleotide frequencies", npentanucleotides), rep("Sample abundances", nsamples)) group_type <- c("Id", rep("Characteristics", 2), rep("Frequencies", nnucleotides), - #rep("Frequencies", npentanucleotides), + rep("Frequencies", npentanucleotides), rep("TimeSeries", nsamples)) schema <- data.frame(name = names(get(dataset.name)), diff --git a/R.preprocessing/preprocessing.R b/R.preprocessing/preprocessing.R index 76e314dd2dee565bcf333cee53ff100c8eab9c37..f1243565f32e07c6ef83d0898a3c1a44b916d6a9 100644 --- a/R.preprocessing/preprocessing.R +++ b/R.preprocessing/preprocessing.R @@ -7,16 +7,26 @@ source("R.preprocessing/SymmetrizedSignatures.R") source("R.preprocessing/ExtractESCG.R") source("R.preprocessing/PrepareDataForInteractiveBinning.R") -# Prepare the Wrighton data set +# Prepare the CSTR data set PrepareDataForInteractiveBinning( - dataset.name = "wrighton", - file.fasta = "data//wrighton_assembly.fasta.gz", - file.abundance = "data//wrighton_avg_cov.csv", - file.escg = "data//wrighton_escg.csv", - file.clusterings = "data//wrighton_clusterings.csv", - dir.result = "R.ICoVeR//data" + dataset.name = "cstr", + file.fasta = "data//CSTRmetagenomics.fasta", + file.abundance = "data//cstr_coverage.csv", + file.escg = "data/cstr_escg.csv", + file.clusterings = "data//cstr_clusterings.csv", + dir.result = "R.ICoVeR//data" ) +# Prepare the Wrighton data set +# PrepareDataForInteractiveBinning( +# dataset.name = "wrighton", +# file.fasta = "data//wrighton_assembled.fasta", +# file.abundance = "data//wrighton_avg_coverage.csv", +# file.escg = "data//wrighton_escg.csv", +# file.clusterings = "data//wrighton_clusterings.csv", +# dir.result = "R.ICoVeR//data" +# ) + # Install the ICoVeR package after data generation. # NOTE: Before running install_local, make sure that R.ICoVeR/R/sqlite.R # is configured properly. The variable p.db.dataset must be assigned the