Commit 0efbd2ae authored by Fintan McGee's avatar Fintan McGee
Browse files

merged different version of the preprocessing files

parent 053bf118
......@@ -25,13 +25,14 @@ PrepareDataForInteractiveBinning <- function(dataset.name,
data.consensus_length <- Biostrings::width(fasta)
data.gc_content <- as.vector(Biostrings::letterFrequency(fasta, letters="CG", as.prob = TRUE))
data.tnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 4, as.prob = TRUE))
#data.pnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 5, as.prob = TRUE))
data.pnf <- SymmetrizedSignatures(FrequenciesSignatures(fasta, width = 5, as.prob = TRUE))
data <- data.frame(row.names = NULL,
CONTIG = names(fasta),
GC_CONTENT = data.gc_content,
LENGTH = data.consensus_length,
data.tnf) #, data.pnf)
data.tnf,
data.pnf)
print("* Reading average coverages...")
abundance <- read.csv(file.abundance)
......@@ -39,7 +40,7 @@ PrepareDataForInteractiveBinning <- function(dataset.name,
# Some basic error checking
if(! ("CONTIG" %in% colnames(abundance))) {
warning("Required field 'CONTIG' not found in abundance file.")
warning("Required field 'contig' not found in abundance file.")
return(FALSE)
}
......@@ -49,41 +50,42 @@ PrepareDataForInteractiveBinning <- function(dataset.name,
warning("Not all contig identifiers from the fasta and the abundance file are equal.")
return(FALSE)
}
data <- plyr::join(data, abundance, by="CONTIG")
data <- merge(data, abundance, by="CONTIG")
cluster.results <- NA
if (!is.null(file.clusterings)) {
print("* Reading clustering results...")
cluster.results <- read.csv(file.clusterings)
names(cluster.results) <- toupper(names(cluster.results))
stopifnot("CONTIG" %in% names(cluster.results))
data <- plyr::join(data, cluster.results, by="CONTIG")
data <- merge(data, cluster.results, by="CONTIG")
}
print("* Reading essential single copy genes...")
# We're currently not using the fasta in the prototype so let's not add it to
# the dataset for now.
#assign(paste(dataset.name, "fasta", sep="."), fasta)
assign(paste(dataset.name, "escg", sep="."), ExtractESCG(file.escg))
assign(dataset.name, data)
print("* COnstructing schema...")
nnucleotides <- dim(data.tnf)[2]
#npentanucleotides <- dim(data.pnf)[2]
nsamples <- ncol(get(dataset.name)) - 3 - nnucleotides #- npentanucleotides # 3: contig, gc, length
npentanucleotides <- dim(data.pnf)[2]
nsamples <- ncol(get(dataset.name)) - 3 - nnucleotides - npentanucleotides # 3: contig, gc, length
# Now create the schema
type <- c("character", # contig Id
"numeric", # GC Contig Properties
"integer", # Consensus_length Contig Properties
rep("numeric", nnucleotides), # Tetra nucleotide frequencies
#rep("numeric", npentanucleotides), # Penta nucleotide frequencies
rep("numeric", npentanucleotides), # Penta nucleotide frequencies
rep("integer", nsamples)) # Sample Abundances
group <- c("Id",
rep("Contig properties", 2),
rep("Tetra nucleotide frequencies", nnucleotides),
#rep("Penta nucleotide frequencies", npentanucleotides),
rep("Penta nucleotide frequencies", npentanucleotides),
rep("Sample abundances", nsamples))
group_type <- c("Id",
rep("Characteristics", 2),
rep("Frequencies", nnucleotides),
#rep("Frequencies", npentanucleotides),
rep("Frequencies", npentanucleotides),
rep("TimeSeries", nsamples))
schema <- data.frame(name = names(get(dataset.name)),
......
......@@ -7,16 +7,26 @@ source("R.preprocessing/SymmetrizedSignatures.R")
source("R.preprocessing/ExtractESCG.R")
source("R.preprocessing/PrepareDataForInteractiveBinning.R")
# Prepare the Wrighton data set
# Prepare the CSTR data set
PrepareDataForInteractiveBinning(
dataset.name = "wrighton",
file.fasta = "data//wrighton_assembly.fasta.gz",
file.abundance = "data//wrighton_avg_cov.csv",
file.escg = "data//wrighton_escg.csv",
file.clusterings = "data//wrighton_clusterings.csv",
dir.result = "R.ICoVeR//data"
dataset.name = "cstr",
file.fasta = "data//CSTRmetagenomics.fasta",
file.abundance = "data//cstr_coverage.csv",
file.escg = "data/cstr_escg.csv",
file.clusterings = "data//cstr_clusterings.csv",
dir.result = "R.ICoVeR//data"
)
# Prepare the Wrighton data set
# PrepareDataForInteractiveBinning(
# dataset.name = "wrighton",
# file.fasta = "data//wrighton_assembled.fasta",
# file.abundance = "data//wrighton_avg_coverage.csv",
# file.escg = "data//wrighton_escg.csv",
# file.clusterings = "data//wrighton_clusterings.csv",
# dir.result = "R.ICoVeR//data"
# )
# Install the ICoVeR package after data generation.
# NOTE: Before running install_local, make sure that R.ICoVeR/R/sqlite.R
# is configured properly. The variable p.db.dataset must be assigned the
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment