22#include "BaseUtilities.h"
38 myMinReadLength(minReadLength),
39 myNumPrintableErrors(numPrintableErrors),
41 myDisableMessages(false),
51 myDisableMessages =
true;
57 myDisableMessages =
false;
87 myMaxErrors = maxErrors;
98 myBaseComposition.resetBaseMapType();
99 myBaseComposition.setBaseMapType(spaceType);
100 myQualPerCycle.clear();
101 myCountPerCycle.clear();
112 myFile =
ifopen(fileName,
"rt");
113 myFileName = fileName;
125 std::string errorMessage =
"ERROR: Failed to open file: ";
126 errorMessage += fileName;
127 logMessage(errorMessage.c_str());
153 std::string errorMessage =
"Failed to close file: ";
154 errorMessage += myFileName.c_str();
155 logMessage(errorMessage.c_str());
165 if((myFile != NULL) && (myFile->isOpen()))
180 if((myFile != NULL) && (
ifeof(myFile)))
195 if(
isEof() || myFileProblem)
217 int numSequences = 0;
225 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
246 myBaseComposition.print();
254 std::string finishMessage =
"Finished processing ";
255 finishMessage += myFileName.c_str();
258 " with %u lines containing %d sequences.",
259 myLineNum, numSequences) > 0)
261 finishMessage += buffer;
262 logMessage(finishMessage.c_str());
265 "There were a total of %d errors.",
281 else if(myNumErrors == 0)
286 if(numSequences == 0)
289 logMessage(
"ERROR: No FastQSequences in the file.");
314 std::string message =
315 "ERROR: Trying to read a fastq file but no file is open.";
316 logMessage(message.c_str());
321 resetForEachSequence();
332 valid = validateSequenceIdentifierLine();
345 if(mySequenceIdLine.Length() != 0)
349 myErrorString =
"Incomplete Sequence.\n";
374 valid &= validateRawSequenceAndPlusLines();
392 myErrorString =
"Incomplete Sequence, missing Quality String.";
399 valid &= validateQualityStringLines();
415bool FastQFile::validateSequenceIdentifierLine()
418 int readStatus = mySequenceIdLine.ReadLine(myFile);
428 myFileProblem =
true;
429 myErrorString =
"Failure trying to read sequence identifier line";
436 if((mySequenceIdLine.Length() == 0) && (
ifeof(myFile)))
447 if(mySequenceIdLine.Length() < 2)
450 myErrorString =
"The sequence identifier line was too short.";
456 if(mySequenceIdLine[0] !=
'@')
459 myErrorString =
"First line of a sequence does not begin with @";
470 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(
' ', 1);
473 if(endSequenceIdentifier == -1)
477 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
484 if(endSequenceIdentifier <= 1)
487 "No Sequence Identifier specified before the comment.";
492 mySequenceIdentifier =
493 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
497 if(myInterleaved && (myPrevSeqID !=
""))
501 if(myPrevSeqID.compare(mySequenceIdentifier) != 0)
504 if((myPrevSeqID.compare(0, myPrevSeqID.length()-1, mySequenceIdentifier.c_str(), mySequenceIdentifier.Length()-1) != 0) ||
505 (((myPrevSeqID[myPrevSeqID.length()-1] !=
'1') || (mySequenceIdentifier[mySequenceIdentifier.Length()-1] !=
'2')) &&
506 (myPrevSeqID[myPrevSeqID.length()-1] != mySequenceIdentifier[mySequenceIdentifier.Length()-1])))
508 myErrorString =
"Interleaved: consecutive reads do not have matching sequence identifiers: ";
509 myErrorString += mySequenceIdentifier.c_str();
510 myErrorString +=
" and ";
511 myErrorString += myPrevSeqID.c_str();
523 myPrevSeqID = mySequenceIdentifier.c_str();
532 std::pair<std::map<std::string, unsigned int>::iterator,
bool> insertResult;
534 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
537 if(insertResult.second ==
false)
540 myErrorString =
"Repeated Sequence Identifier: ";
541 myErrorString += mySequenceIdentifier.c_str();
542 myErrorString +=
" at Lines ";
543 myErrorString += insertResult.first->second;
544 myErrorString +=
" and ";
545 myErrorString += myLineNum;
562bool FastQFile::validateRawSequenceAndPlusLines()
565 int readStatus = myRawSequence.ReadLine(myFile);
571 myFileProblem =
true;
572 myErrorString =
"Failure trying to read sequence line";
581 bool valid = validateRawSequence(offset);
584 offset = myRawSequence.Length();
589 bool stillRawLine =
true;
590 while(stillRawLine &&
602 readStatus = myPlusLine.ReadLine(myFile);
607 myFileProblem =
true;
608 myErrorString =
"Failure trying to read sequence/plus line";
614 if(myPlusLine.Length() == 0)
619 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
623 else if(myPlusLine[0] ==
'+')
626 valid &= validateSequencePlus();
627 stillRawLine =
false;
634 myRawSequence += myPlusLine;
635 myPlusLine.SetLength(0);
636 valid &= validateRawSequence(offset);
639 offset = myRawSequence.Length();
651 if(myRawSequence.Length() < myMinReadLength)
654 myErrorString =
"Raw Sequence is shorter than the min read length: ";
655 myErrorString += myRawSequence.Length();
656 myErrorString +=
" < ";
657 myErrorString += myMinReadLength;
672 myErrorString =
"Reached the end of the file without a '+' line.";
682bool FastQFile::validateQualityStringLines()
685 int readStatus = myQualityString.ReadLine(myFile);
690 myFileProblem =
true;
691 myErrorString =
"Failure trying to read quality line";
700 bool valid = validateQualityString(offset);
702 offset = myQualityString.Length();
706 while((myQualityString.Length() < myRawSequence.Length()) &&
716 readStatus = myTempPartialQuality.ReadLine(myFile);
721 myFileProblem =
true;
722 myErrorString =
"Failure trying to read quality line";
727 myQualityString += myTempPartialQuality;
728 myTempPartialQuality.Clear();
731 valid &= validateQualityString(offset);
732 offset = myQualityString.Length();
743 if(myQualityString.Length() != myRawSequence.Length())
745 myErrorString =
"Quality string length (";
746 myErrorString += myQualityString.Length();
747 myErrorString +=
") does not equal raw sequence length (";
748 myErrorString += myRawSequence.Length();
749 myErrorString +=
")";
758bool FastQFile::validateRawSequence(
int offset)
760 bool validBase =
false;
764 for(
int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
770 myBaseComposition.updateComposition(sequenceIndex,
771 myRawSequence[sequenceIndex]);
776 myErrorString =
"Invalid character ('";
777 myErrorString += myRawSequence[sequenceIndex];
778 myErrorString +=
"') in base sequence.";
794bool FastQFile::validateSequencePlus()
800 int lineLength = myPlusLine.Length();
805 if((lineLength == 1) || (myPlusLine[1] ==
' '))
817 int sequenceIdentifierLength = mySequenceIdentifier.Length();
818 if(lineLength <= sequenceIdentifierLength)
821 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
833 while((same ==
true) && (seqIndex < sequenceIdentifierLength))
835 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
838 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
850bool FastQFile::validateQualityString(
int offset)
853 if(myQualityString.Length() > (
int)(myQualPerCycle.size()))
855 myQualPerCycle.resize(myQualityString.Length());
856 myCountPerCycle.resize(myQualityString.Length());
859 for(
int i=offset; i < myQualityString.Length(); i++)
861 if(myQualityString[i] <= 32)
863 myErrorString =
"Invalid character ('";
864 myErrorString += myQualityString[i];
865 myErrorString +=
"') in quality string.";
877 myCountPerCycle[i] += 1;
887void FastQFile::reportErrorOnLine()
893 if(myNumErrors <= myNumPrintableErrors)
897 sprintf(buffer,
"ERROR on Line %u: ", myLineNum);
898 std::string message = buffer;
899 message += myErrorString.c_str();
900 logMessage(message.c_str());
906void FastQFile::reset()
910 resetForEachSequence();
913 myFileName.SetLength(0);
914 myIdentifierMap.clear();
915 myBaseComposition.clear();
916 myQualPerCycle.clear();
917 myCountPerCycle.clear();
918 myFileProblem =
false;
923void FastQFile::resetForEachSequence()
925 myLineBuffer.SetLength(0);
926 myErrorString.SetLength(0);
927 myRawSequence.SetLength(0);
928 mySequenceIdLine.SetLength(0);
929 mySequenceIdentifier.SetLength(0);
930 myPlusLine.SetLength(0);
931 myQualityString.SetLength(0);
932 myTempPartialQuality.SetLength(0);
936void FastQFile::logMessage(
const char* logMessage)
939 if(!myDisableMessages)
941 std::cout << logMessage << std::endl;
948bool FastQFile::isTimeToQuit()
952 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
960void FastQFile::printAvgQual()
962 std::cout << std::endl <<
"Average Phred Quality by Read Index (starts at 0):" << std::endl;
963 std::cout.precision(2);
964 std::cout << std::fixed <<
"Read Index\tAverage Quality"
966 if(myQualPerCycle.size() != myCountPerCycle.size())
969 std::cerr <<
"ERROR calculating the average Qualities per cycle\n";
975 for(
unsigned int i = 0; i < myQualPerCycle.size(); i++)
978 if(myCountPerCycle[i] != 0)
980 avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
982 std::cout << i <<
"\t" << avgQual <<
"\n";
983 sumQual += myQualPerCycle[i];
984 count += myCountPerCycle[i];
986 std::cout << std::endl;
990 avgQual = sumQual / count;
992 std::cout <<
"Overall Average Phred Quality = " << avgQual << std::endl;
SPACE_TYPE
The type of space (color or base) to use in the mapping.
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
void interleaved()
Interleaved.
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
void disableMessages()
Disable messages - do not write to cout.
bool isOpen()
Check to see if the file is open.
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
FastQStatus::Status closeFile()
Close a FastQFile.
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a proble...
void enableMessages()
Enable messages - write to cout.
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
bool isEof()
Check to see if the file is at the end of the file.
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
@ FASTQ_ORDER_ERROR
means the methods are called out of order, like trying to read a file before opening it.
@ FASTQ_READ_ERROR
means that a problem occurred on a read.
@ FASTQ_SUCCESS
indicates method finished successfully.
@ FASTQ_INVALID
means that the sequence was invalid.
@ FASTQ_OPEN_ERROR
means the file could not be opened.
@ FASTQ_NO_SEQUENCE_ERROR
means there were no errors, but no sequences read.
@ FASTQ_CLOSE_ERROR
means the file could not be closed.