Skip to content
Snippets Groups Projects
Commit a1496118 authored by Lisa Fiedler's avatar Lisa Fiedler
Browse files

fixed boundary case exception for gene ends

parent f9c7c6db
No related branches found
No related tags found
No related merge requests found
...@@ -45,6 +45,7 @@ public class DeGeCI { ...@@ -45,6 +45,7 @@ public class DeGeCI {
protected static int maxD = 100; protected static int maxD = 100;
public static int transTable = -1; public static int transTable = -1;
public static int shift = 6; public static int shift = 6;
public static boolean DEBUG = false;
static public enum AmbigFilter { static public enum AmbigFilter {
REPLACE_FILTER, REPLACE_FILTER,
...@@ -2073,7 +2074,10 @@ public class DeGeCI { ...@@ -2073,7 +2074,10 @@ public class DeGeCI {
} }
// annotationDF.orderBy(ColumnIdentifier.START).show(50000); // annotationDF.orderBy(ColumnIdentifier.START).show(50000);
// SparkComputer.persistDataFrameORC(annotationDF,path+"/../PREDICTIONS"); if(DEBUG) {
SparkComputer.persistDataFrameORC(annotationDF,path+"/../PREDICTIONS");
}
if(transTable != -1) { if(transTable != -1) {
annotationDF = improveGeneEnds(annotationDF,transTable,shift,sequence); annotationDF = improveGeneEnds(annotationDF,transTable,shift,sequence);
...@@ -2360,7 +2364,7 @@ public class DeGeCI { ...@@ -2360,7 +2364,7 @@ public class DeGeCI {
List<Row> preditions = new ArrayList<>(); List<Row> preditions = new ArrayList<>();
Column[] columns = SparkComputer.getColumns(propertyRanges.columns()); Column[] columns = SparkComputer.getColumns(propertyRanges.columns());
// propertyRanges.orderBy(ColumnIdentifier.START).show(5000); if(DEBUG) propertyRanges.orderBy(ColumnIdentifier.START).show(5000);
propertyRanges = propertyRanges propertyRanges = propertyRanges
// only check end // only check end
.filter(col(ColumnIdentifier.START).gt(col(ColumnIdentifier.END))) .filter(col(ColumnIdentifier.START).gt(col(ColumnIdentifier.END)))
...@@ -2379,7 +2383,7 @@ public class DeGeCI { ...@@ -2379,7 +2383,7 @@ public class DeGeCI {
propertyRanges.filter(col(ColumnIdentifier.START).leq(col(ColumnIdentifier.END))).withColumn(ColumnIdentifier.FLAG,lit(3)) propertyRanges.filter(col(ColumnIdentifier.START).leq(col(ColumnIdentifier.END))).withColumn(ColumnIdentifier.FLAG,lit(3))
); );
// propertyRanges.orderBy(ColumnIdentifier.START).show(5000); if(DEBUG) propertyRanges.orderBy(ColumnIdentifier.START).show(5000);
for (Row propertyRange: propertyRanges.collectAsList()){ for (Row propertyRange: propertyRanges.collectAsList()){
int start = propertyRange.getInt(propertyRange.fieldIndex(ColumnIdentifier.START)); int start = propertyRange.getInt(propertyRange.fieldIndex(ColumnIdentifier.START));
int stop = propertyRange.getInt(propertyRange.fieldIndex(ColumnIdentifier.END))+1; int stop = propertyRange.getInt(propertyRange.fieldIndex(ColumnIdentifier.END))+1;
...@@ -2388,11 +2392,10 @@ public class DeGeCI { ...@@ -2388,11 +2392,10 @@ public class DeGeCI {
// System.out.println(propertyRange.getString(propertyRange.fieldIndex(ColumnIdentifier.PROPERTY)));
if(propertyRange.getString(propertyRange.fieldIndex(ColumnIdentifier.CATEGORY)).equals( "protein")) { if(propertyRange.getString(propertyRange.fieldIndex(ColumnIdentifier.CATEGORY)).equals( "protein")) {
if(DEBUG) System.out.println(propertyRange.getString(propertyRange.fieldIndex(ColumnIdentifier.PROPERTY)));
String subsequence = sequence.substring(start,stop); String subsequence = sequence.substring(start,stop);
// System.out.println(start+ " " + stop); // System.out.println(start+ " " + stop);
...@@ -2401,44 +2404,45 @@ public class DeGeCI { ...@@ -2401,44 +2404,45 @@ public class DeGeCI {
String startCodon = subsequence.substring(0,3); String startCodon = subsequence.substring(0,3);
String stopCodon = subsequence.substring(subsequence.length()-3); String stopCodon = subsequence.substring(subsequence.length()-3);
// System.out.println("Start codon " + startCodon); if(DEBUG) System.out.println("Start codon " + startCodon);
// System.out.println("Stop Codon" + stopCodon); if(DEBUG) System.out.println("Stop Codon" + stopCodon);
if(Arrays.asList(2,3).contains(propertyRange.getInt(propertyRange.fieldIndex(ColumnIdentifier.FLAG))) && !initiationCodes.get(transTable).contains(startCodon)) { if(Arrays.asList(2,3).contains(propertyRange.getInt(propertyRange.fieldIndex(ColumnIdentifier.FLAG))) && !initiationCodes.get(transTable).contains(startCodon)) {
// System.out.println("no start codon " + startCodon); if(DEBUG) System.out.println("no start codon " + startCodon);
for (int i = 1; i <= shift; i++) { for (int i = 1; i <= shift; i++) {
if(DEBUG) System.out.println(Auxiliary.mod(start+i,sequence.length()) +" " +Auxiliary.mod(start+i+3,sequence.length()));
String codon = sequence.substring(Auxiliary.mod(start+i,sequence.length()),Auxiliary.mod(start+i+3,sequence.length())); String codon = extractSubsequence(sequence,Auxiliary.mod(start+i,sequence.length()),Auxiliary.mod(start+i+3,sequence.length()));
if(DEBUG) System.out.println(codon);
if(terminationCodes.get(transTable).contains(codon)) { if(terminationCodes.get(transTable).contains(codon)) {
// System.out.println("Termination codon encountered"); if(DEBUG) System.out.println("Termination codon encountered");
// System.out.println("Codon: " +codon); if(DEBUG) System.out.println("Codon: " +codon);
break; break;
} }
if(initiationCodes.get(transTable).contains(codon)) { if(initiationCodes.get(transTable).contains(codon)) {
// System.out.println("Inition codon encountered"); if(DEBUG) System.out.println("Inition codon encountered");
// System.out.println("Codon: " +codon);_ if(DEBUG) System.out.println("Codon: " +codon);
offsetInit = i; offsetInit = i;
break; break;
} }
} }
if(offsetInit== 0){ if(offsetInit== 0){
for (int i = 1; i <= shift; i++) { for (int i = 1; i <= shift; i++) {
String codon = sequence.substring(Auxiliary.mod(start-i,sequence.length()),Auxiliary.mod(start-i+3,sequence.length())); String codon = extractSubsequence(sequence,Auxiliary.mod(start-i,sequence.length()),Auxiliary.mod(start-i+3,sequence.length()));
if(DEBUG) System.out.println(codon);
if(terminationCodes.get(transTable).contains(codon)) { if(terminationCodes.get(transTable).contains(codon)) {
// System.out.println("Termination codon encountered"); if(DEBUG) System.out.println("Termination codon encountered");
// System.out.println("Codon: " +codon); if(DEBUG) System.out.println("Codon: " +codon);
break; break;
} }
if(initiationCodes.get(transTable).contains(codon)) { if(initiationCodes.get(transTable).contains(codon)) {
// System.out.println("Inition codon encountered"); if(DEBUG) System.out.println("Inition codon encountered");
// System.out.println("Codon: " +codon); if(DEBUG) System.out.println("Codon: " +codon);
offsetInit = -i; offsetInit = -i;
break; break;
} }
...@@ -2448,30 +2452,30 @@ public class DeGeCI { ...@@ -2448,30 +2452,30 @@ public class DeGeCI {
} }
if(Arrays.asList(1,3).contains(propertyRange.getInt(propertyRange.fieldIndex(ColumnIdentifier.FLAG))) && !terminationCodes.get(transTable).contains(stopCodon)) { if(Arrays.asList(1,3).contains(propertyRange.getInt(propertyRange.fieldIndex(ColumnIdentifier.FLAG))) && !terminationCodes.get(transTable).contains(stopCodon)) {
// System.out.println("no stop codon " + stopCodon); if(DEBUG) System.out.println("no stop codon " + stopCodon);
for (int i = 1; i <= shift; i++) { for (int i = 1; i <= shift; i++) {
// System.out.println(Auxiliary.mod(stop-3+i,sequence.length()) + " " +Auxiliary.mod(stop-1+i,sequence.length())); if(DEBUG) System.out.println(Auxiliary.mod(stop-3+i,sequence.length()) + " " +Auxiliary.mod(stop-1+i,sequence.length()));
String codon = sequence.substring(Auxiliary.mod(stop-3+i,sequence.length()),Auxiliary.mod(stop+i,sequence.length())); String codon = extractSubsequence(sequence,Auxiliary.mod(stop-3+i,sequence.length()),Auxiliary.mod(stop+i,sequence.length()));
if(DEBUG) System.out.println(codon);
if(terminationCodes.get(transTable).contains(codon)) { if(terminationCodes.get(transTable).contains(codon)) {
// System.out.println("Termination codon encountered"); if(DEBUG) System.out.println("Termination codon encountered");
// System.out.println("Codon: " +codon); if(DEBUG) System.out.println("Codon: " +codon);
offsetTerm = i; offsetTerm = i;
break; break;
} }
} }
if(offsetTerm == 0){ if(offsetTerm == 0){
for (int i = 1; i <= shift; i++) { for (int i = 1; i <= shift; i++) {
// System.out.println(Auxiliary.mod(stop-3-i,sequence.length()) + " "+ Auxiliary.mod(stop-i,sequence.length())); if(DEBUG) System.out.println(Auxiliary.mod(stop-3-i,sequence.length()) + " "+ Auxiliary.mod(stop-i,sequence.length()));
String codon = sequence.substring(Auxiliary.mod(stop-3-i,sequence.length()),Auxiliary.mod(stop-i,sequence.length())); String codon = extractSubsequence(sequence,Auxiliary.mod(stop-3-i,sequence.length()),Auxiliary.mod(stop-i,sequence.length()));
if(DEBUG) System.out.println(codon);
if(terminationCodes.get(transTable).contains(codon)) { if(terminationCodes.get(transTable).contains(codon)) {
// System.out.println("Termination codon encountered"); if(DEBUG) System.out.println("Termination codon encountered");
// System.out.println("Codon: " +codon); if(DEBUG) System.out.println("Codon: " +codon);
offsetTerm = -i; offsetTerm = -i;
break; break;
} }
...@@ -2481,7 +2485,7 @@ public class DeGeCI { ...@@ -2481,7 +2485,7 @@ public class DeGeCI {
} }
// System.out.println("New position with offsets"+ offsetInit + " " + offsetTerm); if(DEBUG) System.out.println("New position with offsets"+ offsetInit + " " + offsetTerm);
} }
preditions.add( preditions.add(
...@@ -2508,6 +2512,15 @@ public class DeGeCI { ...@@ -2508,6 +2512,15 @@ public class DeGeCI {
); );
} }
private static String extractSubsequence(String sequence, int start, int end) {
if(start > end) {
// return sequence.substring(start) + sequence.substring(0,(end == 0) ? 1 : end);
return sequence.substring(start) + sequence.substring(0, end);
}
return sequence.substring(start,end);
}
public static void annotateRefseq(String name, File fastaFile, int taxId, boolean circular, public static void annotateRefseq(String name, File fastaFile, int taxId, boolean circular,
String dir, int weight, int minCount, int k, String refseq, double alpha, boolean keepIntermediateResults) throws IOException { String dir, int weight, int minCount, int k, String refseq, double alpha, boolean keepIntermediateResults) throws IOException {
String sequence = readSingleFastaFile(fastaFile.getAbsolutePath()); String sequence = readSingleFastaFile(fastaFile.getAbsolutePath());
...@@ -2608,6 +2621,7 @@ public class DeGeCI { ...@@ -2608,6 +2621,7 @@ public class DeGeCI {
} }
} }
public static void annotateRefseq(String name, String sequence, int taxId, boolean circular, public static void annotateRefseq(String name, String sequence, int taxId, boolean circular,
String dir, int weight, int minCount, int k, String refseq, double alpha, boolean keepIntermediateResults) throws IOException { String dir, int weight, int minCount, int k, String refseq, double alpha, boolean keepIntermediateResults) throws IOException {
...@@ -2655,7 +2669,7 @@ public class DeGeCI { ...@@ -2655,7 +2669,7 @@ public class DeGeCI {
// Dataset<Row> predictionsLS = SparkComputer.readORC(dir+"/JOINED_GENE_RANGES_LS"); // Dataset<Row> predictionsLS = SparkComputer.readORC(dir+"/JOINED_GENE_RANGES_LS");
} }
public static void clear(String path) throws InterruptedException { public static void clear(String path, String refseq) {
Spark.clearDirectory(path+"/BRIDGED"); Spark.clearDirectory(path+"/BRIDGED");
// TimeUnit.SECONDS.sleep(30); // TimeUnit.SECONDS.sleep(30);
Spark.clearDirectory(path+"/CLUSTER"); Spark.clearDirectory(path+"/CLUSTER");
...@@ -2664,6 +2678,9 @@ public class DeGeCI { ...@@ -2664,6 +2678,9 @@ public class DeGeCI {
// TimeUnit.SECONDS.sleep(30); // TimeUnit.SECONDS.sleep(30);
Spark.clearDirectory(path+"/JOINED_GENE_RANGES_LS"); Spark.clearDirectory(path+"/JOINED_GENE_RANGES_LS");
// TimeUnit.SECONDS.sleep(30); // TimeUnit.SECONDS.sleep(30);
Spark.clearDirectory(path+"/KMER_MAPPING_"+refseq);
Spark.clearDirectory(path+"/PREDICTIONS");
} }
......
...@@ -27,6 +27,7 @@ public class DeGeCIInputParser extends InputParser { ...@@ -27,6 +27,7 @@ public class DeGeCIInputParser extends InputParser {
protected void setArguments() { protected void setArguments() {
this.parser = ArgumentParsers.newFor("Mitochondrial genome annotation"). this.parser = ArgumentParsers.newFor("Mitochondrial genome annotation").
build().description("Specify parameters for annotation pipe line"); build().description("Specify parameters for annotation pipe line");
parser.addArgument("--debug").action(Arguments.storeTrue()).type(Boolean.class).setDefault(false).help("Only for debbuging purposes");
parser.addArgument("--sequence").type(String.class).help("Nucleotide sequence"); parser.addArgument("--sequence").type(String.class).help("Nucleotide sequence");
parser.addArgument("--fasta-file").type(String.class).help("path to fasta file. Multi fasta files are NOT supported"); parser.addArgument("--fasta-file").type(String.class).help("path to fasta file. Multi fasta files are NOT supported");
parser.addArgument("--name").type(String.class).setDefault("inputGenome").help("name of input genome"); parser.addArgument("--name").type(String.class).setDefault("inputGenome").help("name of input genome");
...@@ -75,93 +76,101 @@ public class DeGeCIInputParser extends InputParser { ...@@ -75,93 +76,101 @@ public class DeGeCIInputParser extends InputParser {
// System.out.println(TaxonomyGraph.getRecordsToFilteredTaxIds( ( taxids,false).size()); // System.out.println(TaxonomyGraph.getRecordsToFilteredTaxIds( ( taxids,false).size());
// System.out.println(res); // System.out.println(res);
if(res.getInt("trans_table") != null) { try {
DeGeCI.transTable = res.getInt("trans_table"); if (res.getBoolean("debug") != null) {
DeGeCI.shift = res.getInt("shift"); DeGeCI.DEBUG = res.getBoolean("debug");
} }
if(res.getString("sequence") == null & res.getString("fasta_file") == null) { if (res.getInt("trans_table") != null) {
LOGGER.error("Sequence must be supplied"); DeGeCI.transTable = res.getInt("trans_table");
System.exit(-1); DeGeCI.shift = res.getInt("shift");
} }
List<Integer> refseqRealses = Arrays.asList(89,204); if (res.getString("sequence") == null & res.getString("fasta_file") == null) {
if(!refseqRealses.contains(res.getInt("refseq"))){ LOGGER.error("Sequence must be supplied");
LOGGER.error("Invalid refseq release. Valid options are " + refseqRealses);
System.exit(-1);
}
TaxonomyGraph taxonomyGraph = null;
List<Integer> taxids = (List<Integer>)res.get("taxids");
List<Integer> filterRecords = null;
if(taxids != null || res.get("taxonomic_groups") != null){
LOGGER.error("Currently not yet implemented.");
System.exit(0);
taxonomyGraph = TaxonomyGraph.build();
}
if(taxids == null && res.get("taxonomic_groups") != null) {
taxids = taxonomyGraph.getTaxidsForTaxonomicName(res.get("taxonomic_groups"));
if(taxids == null) {
LOGGER.error("No taxids found for provided taxonomic groups");
System.exit(-1); System.exit(-1);
} }
} List<Integer> refseqRealses = Arrays.asList(89, 204);
if(taxids != null) { if (!refseqRealses.contains(res.getInt("refseq"))) {
LOGGER.info("Launching Taxonomic Filtering"); LOGGER.error("Invalid refseq release. Valid options are " + refseqRealses);
filterRecords = taxonomyGraph.getRecordsToFilteredTaxIds(taxids,res.getBoolean("exclude_taxids"));
LOGGER.info("Using " + filterRecords.size() + " genomes for mapping");
}
if(res.getString("sequence") != null) {
if(res.getDouble("alpha") < 0 | res.getDouble("alpha") >1){
LOGGER.error("Alpha must be between 0 and 1");
System.exit(-1); System.exit(-1);
} }
try { TaxonomyGraph taxonomyGraph = null;
if(taxids!= null) { List<Integer> taxids = (List<Integer>) res.get("taxids");
DeGeCI.annotateRefseq(res.getString("name"),res.getString("sequence"),res.getInt("taxid"), List<Integer> filterRecords = null;
!res.getBoolean("treat_linear"),res.getString("result_directory"), if (taxids != null || res.get("taxonomic_groups") != null) {
res.getInt("weight"),res.getInt("count"),16,res.getInt("refseq")+"", LOGGER.error("Currently not yet implemented.");
res.getDouble("alpha"),res.getBoolean("keep_intermediate_dataframes"),filterRecords System.exit(0);
); taxonomyGraph = TaxonomyGraph.build();
} }
else{ if (taxids == null && res.get("taxonomic_groups") != null) {
DeGeCI.annotateRefseq(res.getString("name"),res.getString("sequence"),res.getInt("taxid"), taxids = taxonomyGraph.getTaxidsForTaxonomicName(res.get("taxonomic_groups"));
!res.getBoolean("treat_linear"),res.getString("result_directory"), if (taxids == null) {
res.getInt("weight"),res.getInt("count"),16,res.getInt("refseq")+"", LOGGER.error("No taxids found for provided taxonomic groups");
res.getDouble("alpha"),res.getBoolean("keep_intermediate_dataframes") System.exit(-1);
);
} }
} catch (IOException e) {
e.printStackTrace();
} }
} if (taxids != null) {
else{ LOGGER.info("Launching Taxonomic Filtering");
File fastaFile = new File(res.getString("fasta_file")); filterRecords = taxonomyGraph.getRecordsToFilteredTaxIds(taxids, res.getBoolean("exclude_taxids"));
if(!fastaFile.exists()){ LOGGER.info("Using " + filterRecords.size() + " genomes for mapping");
LOGGER.info("Could not find fasta file " + fastaFile.getAbsolutePath());
System.exit(-1);
} }
try { if (res.getString("sequence") != null) {
if(taxids != null) { if (res.getDouble("alpha") < 0 | res.getDouble("alpha") > 1) {
DeGeCI.annotateRefseq(res.getString("name"),DeGeCI.readSingleFastaFile(fastaFile.getAbsolutePath()),res.getInt("taxid"), LOGGER.error("Alpha must be between 0 and 1");
!res.getBoolean("treat_linear"),res.getString("result_directory"), System.exit(-1);
res.getInt("weight"),res.getInt("count"),16,res.getInt("refseq")+"",
res.getDouble("alpha"),res.getBoolean("keep_intermediate_dataframes"),filterRecords
);
}
else{
DeGeCI.annotateRefseq(res.getString("name"),fastaFile,res.getInt("taxid"),
!res.getBoolean("treat_linear"),res.getString("result_directory"),
res.getInt("weight"),res.getInt("count"),16,res.getInt("refseq")+"",
res.getDouble("alpha"),res.getBoolean("keep_intermediate_dataframes")
);
} }
} catch (Exception e) { try {
e.printStackTrace(); if (taxids != null) {
DeGeCI.annotateRefseq(res.getString("name"), res.getString("sequence"), res.getInt("taxid"),
!res.getBoolean("treat_linear"), res.getString("result_directory"),
res.getInt("weight"), res.getInt("count"), 16, res.getInt("refseq") + "",
res.getDouble("alpha"), res.getBoolean("keep_intermediate_dataframes"), filterRecords
);
} else {
DeGeCI.annotateRefseq(res.getString("name"), res.getString("sequence"), res.getInt("taxid"),
!res.getBoolean("treat_linear"), res.getString("result_directory"),
res.getInt("weight"), res.getInt("count"), 16, res.getInt("refseq") + "",
res.getDouble("alpha"), res.getBoolean("keep_intermediate_dataframes")
);
}
} catch (IOException e) {
e.printStackTrace();
}
} else {
File fastaFile = new File(res.getString("fasta_file"));
if (!fastaFile.exists()) {
LOGGER.info("Could not find fasta file " + fastaFile.getAbsolutePath());
System.exit(-1);
}
try {
if (taxids != null) {
DeGeCI.annotateRefseq(res.getString("name"), DeGeCI.readSingleFastaFile(fastaFile.getAbsolutePath()), res.getInt("taxid"),
!res.getBoolean("treat_linear"), res.getString("result_directory"),
res.getInt("weight"), res.getInt("count"), 16, res.getInt("refseq") + "",
res.getDouble("alpha"), res.getBoolean("keep_intermediate_dataframes"), filterRecords
);
} else {
DeGeCI.annotateRefseq(res.getString("name"), fastaFile, res.getInt("taxid"),
!res.getBoolean("treat_linear"), res.getString("result_directory"),
res.getInt("weight"), res.getInt("count"), 16, res.getInt("refseq") + "",
res.getDouble("alpha"), res.getBoolean("keep_intermediate_dataframes")
);
}
} catch (Exception e) {
e.printStackTrace();
}
} }
} }
catch (Exception e) {
LOGGER.error(e.getMessage());
} finally {
DeGeCI.clear(res.getString("result_directory"),res.getInt("refseq")+"");
}
...@@ -169,9 +178,10 @@ public class DeGeCIInputParser extends InputParser { ...@@ -169,9 +178,10 @@ public class DeGeCIInputParser extends InputParser {
public static void main(String[] args) { public static void main(String[] args) {
new DeGeCIInputParser(args);
new DeGeCIInputParser(args);
System.exit(0); System.exit(0);
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment