BamTools  2.5.1
BamAlignment.h
Go to the documentation of this file.
1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 25 July 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
9 
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
12 
13 #include <cstddef>
14 #include <cstdlib>
15 #include <cstring>
16 #include <string>
17 #include <vector>
18 #include "api/BamAux.h"
19 #include "api/BamConstants.h"
20 #include "api/api_global.h"
21 
22 namespace BamTools {
23 
// BamAlignment grants friendship to the reader and writer implementation
// classes, so their names have to be introduced before the class definition.
namespace Internal {

class BamWriterPrivate;
class BamReaderPrivate;

}  // namespace Internal
31 
32 // BamAlignment data structure
34 {
35 
36  // constructors & destructor
37 public:
38  BamAlignment();
39  BamAlignment(const BamAlignment& other);
40  ~BamAlignment();
41 
42  // queries against alignment flags
43 public:
44  bool IsDuplicate() const; // returns true if this read is a PCR duplicate
45  bool IsFailedQC() const; // returns true if this read failed quality control
46  bool IsFirstMate() const; // returns true if alignment is first mate on read
47  bool IsMapped() const; // returns true if alignment is mapped
48  bool IsMateMapped() const; // returns true if alignment's mate is mapped
49  bool IsMateReverseStrand() const; // returns true if alignment's mate mapped to reverse strand
50  bool IsPaired() const; // returns true if alignment part of paired-end read
51  bool IsPrimaryAlignment() const; // returns true if reported position is primary alignment
52  bool IsProperPair()
53  const; // returns true if alignment is part of read that satisfied paired-end resolution
54  bool IsReverseStrand() const; // returns true if alignment mapped to reverse strand
55  bool IsSecondMate() const; // returns true if alignment is second mate on read
56 
57  // manipulate alignment flags
58 public:
59  void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
60  void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
61  void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
62  void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
63  void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
64  void SetIsMateReverseStrand(
65  bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
66  void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
67  void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
68  void SetIsProperPair(
69  bool
70  ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
71  void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
72  void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
73 
74  // tag data access methods
75 public:
76  // add a new tag
77  template <typename T>
78  bool AddTag(const std::string& tag, const std::string& type, const T& value);
79  template <typename T>
80  bool AddTag(const std::string& tag, const std::vector<T>& values);
81 
82  // edit (or append) tag
83  template <typename T>
84  bool EditTag(const std::string& tag, const std::string& type, const T& value);
85  template <typename T>
86  bool EditTag(const std::string& tag, const std::vector<T>& values);
87 
88  // retrieves tag data
89  template <typename T>
90  bool GetTag(const std::string& tag, T& destination) const;
91  template <typename T>
92  bool GetTag(const std::string& tag, std::vector<T>& destination) const;
93 
94  // retrieves all current tag names
95  std::vector<std::string> GetTagNames() const;
96 
97  // retrieves the SAM/BAM type-code for requested tag name
98  bool GetTagType(const std::string& tag, char& type) const;
99 
100  // retrieves the SAM/BAM type-code for the data elements in an array tag
101  bool GetArrayTagType(const std::string& tag, char& type) const;
102 
103  // returns true if alignment has a record for this tag name
104  bool HasTag(const std::string& tag) const;
105 
106  // removes a tag
107  void RemoveTag(const std::string& tag);
108 
109  // additional methods
110 public:
111  // populates alignment string fields
112  bool BuildCharData();
113 
114  // calculates alignment end position
115  int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
116 
117  // returns a description of the last error that occurred
118  std::string GetErrorString() const;
119 
120  // retrieves the size, read locations and reference locations of soft-clip operations
121  bool GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions,
122  std::vector<int>& genomePositions, bool usePadded = false) const;
123 
124  // public data fields
125 public:
126  std::string Name; // read name
127  int32_t Length; // length of query sequence
128  std::string QueryBases; // 'original' sequence (contained in BAM file)
129  std::string
130  AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars)
131  std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
132  std::string TagData; // tag data (use provided methods to query/modify)
133  int32_t RefID; // ID number for reference sequence
134  int32_t Position; // position (0-based) where alignment starts
135  uint16_t Bin; // BAM (standard) index bin number for this alignment
136  uint16_t MapQuality; // mapping quality score
137  uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
138  std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
139  int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
140  int32_t MatePosition; // position (0-based) where alignment's mate starts
141  int32_t InsertSize; // mate-pair insert size
142  std::string Filename; // name of BAM file which this alignment comes from
143 
145  // internal utility methods
146 private:
147  bool FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength,
148  unsigned int& numBytesParsed) const;
149  bool IsValidSize(const std::string& tag, const std::string& type) const;
150  void SetErrorString(const std::string& where, const std::string& what) const;
151  bool SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const;
153  // internal data
154 private:
155  struct BamAlignmentSupportData
156  {
157 
159  // data members
160  std::string AllCharData;
161  uint32_t BlockLength;
162  uint32_t NumCigarOperations;
163  uint32_t QueryNameLength;
164  uint32_t QuerySequenceLength;
165  bool HasCoreOnly;
166 
168  // constructor
169  BamAlignmentSupportData()
170  : BlockLength(0)
171  , NumCigarOperations(0)
172  , QueryNameLength(0)
173  , QuerySequenceLength(0)
174  , HasCoreOnly(false)
175  {}
176  };
177  BamAlignmentSupportData SupportData;
178  friend class Internal::BamReaderPrivate;
179  friend class Internal::BamWriterPrivate;
180 
181  mutable std::string ErrorString; // mutable to allow updates even in logically const methods
182 };
183 
184 // ---------------------------------------------------------
185 // BamAlignment tag access methods
186 
198 template <typename T>
199 inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value)
200 {
201 
202  // if char data not populated, do that first
203  if (SupportData.HasCoreOnly) BuildCharData();
204 
205  // check tag/type size
206  if (!IsValidSize(tag, type)) {
207  // TODO: set error string?
208  return false;
209  }
210 
211  // check that storage type code is OK for T
212  if (!TagTypeHelper<T>::CanConvertTo(type.at(0))) {
213  // TODO: set error string?
214  return false;
215  }
216 
217  // localize the tag data
218  char* pTagData = (char*)TagData.data();
219  const unsigned int tagDataLength = TagData.size();
220  unsigned int numBytesParsed = 0;
221 
222  // if tag already exists, return false
223  // use EditTag explicitly instead
224  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
225  // TODO: set error string?
226  return false;
227  }
228 
229  // otherwise, convert value to string
230  union
231  {
232  T value;
233  char valueBuffer[sizeof(T)];
234  } un;
235  un.value = value;
236 
237  // copy original tag data to temp buffer
238  const std::string newTag = tag + type;
239  const std::size_t newTagDataLength =
240  tagDataLength + newTag.size() + sizeof(T); // leave room for new T
241  RaiiBuffer originalTagData(newTagDataLength);
242  memcpy(originalTagData.Buffer, TagData.c_str(),
243  tagDataLength + 1); // '+1' for TagData null-term
244 
245  // append newTag
246  strcat(originalTagData.Buffer + tagDataLength, newTag.data());
247  memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
248 
249  // store temp buffer back in TagData
250  const char* newTagData = (const char*)originalTagData.Buffer;
251  TagData.assign(newTagData, newTagDataLength);
252  return true;
253 }
254 
255 template <>
256 inline bool BamAlignment::AddTag<std::string>(const std::string& tag, const std::string& type,
257  const std::string& value)
258 {
259  // if char data not populated, do that first
260  if (SupportData.HasCoreOnly) BuildCharData();
261 
262  // check tag/type size
263  if (!IsValidSize(tag, type)) {
264  // TODO: set error string?
265  return false;
266  }
267 
268  // check that storage type code is OK for string
269  if (!TagTypeHelper<std::string>::CanConvertTo(type.at(0))) {
270  // TODO: set error string?
271  return false;
272  }
273 
274  // localize the tag data
275  char* pTagData = (char*)TagData.data();
276  const unsigned int tagDataLength = TagData.size();
277  unsigned int numBytesParsed = 0;
278 
279  // if tag already exists, return false
280  // use EditTag explicitly instead
281  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
282  // TODO: set error string?
283  return false;
284  }
285 
286  // otherwise, copy tag data to temp buffer
287  const std::string newTag = tag + type + value;
288  const std::size_t newTagDataLength =
289  tagDataLength + newTag.size() + 1; // leave room for null-term
290  RaiiBuffer originalTagData(newTagDataLength);
291  memcpy(originalTagData.Buffer, TagData.c_str(),
292  tagDataLength + 1); // '+1' for TagData null-term
293 
294  // append newTag (removes original null-term, then appends newTag + null-term)
295  strcat(originalTagData.Buffer + tagDataLength, newTag.data());
296 
297  // store temp buffer back in TagData
298  const char* newTagData = (const char*)originalTagData.Buffer;
299  TagData.assign(newTagData, newTagDataLength);
300  return true;
301 }
302 
313 template <typename T>
314 inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values)
315 {
316 
317  // if char data not populated, do that first
318  if (SupportData.HasCoreOnly) BuildCharData();
319 
320  // check for valid tag name length
321  if (tag.size() != Constants::BAM_TAG_TAGSIZE) return false;
322 
323  // localize the tag data
324  char* pTagData = (char*)TagData.data();
325  const unsigned int tagDataLength = TagData.size();
326  unsigned int numBytesParsed = 0;
327 
328  // if tag already exists, return false
329  // use EditTag explicitly instead
330  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
331  // TODO: set error string?
332  return false;
333  }
334 
335  // build new tag's base information
336  char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
337  memcpy(newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE);
338  newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
339  newTagBase[3] = TagTypeHelper<T>::TypeCode();
340 
341  // add number of array elements to newTagBase
342  const int32_t numElements = values.size();
343  memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
344 
345  // copy current TagData string to temp buffer, leaving room for new tag's contents
346  const std::size_t newTagDataLength =
347  tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + numElements * sizeof(T);
348  RaiiBuffer originalTagData(newTagDataLength);
349  memcpy(originalTagData.Buffer, TagData.c_str(),
350  tagDataLength + 1); // '+1' for TagData's null-term
351 
352  // write newTagBase (removes old null term)
353  strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
354 
355  // add vector elements to tag
356  int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
357  for (int i = 0; i < numElements; ++i) {
358  const T& value = values.at(i);
359  memcpy(originalTagData.Buffer + elementsBeginOffset + i * sizeof(T), &value, sizeof(T));
360  }
361 
362  // store temp buffer back in TagData
363  const char* newTagData = (const char*)originalTagData.Buffer;
364  TagData.assign(newTagData, newTagDataLength);
365  return true;
366 }
367 
382 template <typename T>
383 inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value)
384 {
385 
386  // if char data not populated, do that first
387  if (SupportData.HasCoreOnly) BuildCharData();
388 
389  // remove existing tag if present, then append tag with new value
390  if (HasTag(tag)) RemoveTag(tag);
391  return AddTag(tag, type, value);
392 }
393 
405 template <typename T>
406 inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values)
407 {
408 
409  // if char data not populated, do that first
410  if (SupportData.HasCoreOnly) BuildCharData();
411 
412  // remove existing tag if present, then append tag with new values
413  if (HasTag(tag)) RemoveTag(tag);
414  return AddTag(tag, values);
415 }
416 
424 template <typename T>
425 inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const
426 {
427 
428  // skip if alignment is core-only
429  if (SupportData.HasCoreOnly) {
430  // TODO: set error string?
431  return false;
432  }
433 
434  // skip if no tags present
435  if (TagData.empty()) {
436  // TODO: set error string?
437  return false;
438  }
439 
440  // localize the tag data
441  char* pTagData = (char*)TagData.data();
442  const unsigned int tagDataLength = TagData.size();
443  unsigned int numBytesParsed = 0;
444 
445  // return failure if tag not found
446  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
447  // TODO: set error string?
448  return false;
449  }
450 
451  // fetch data type
452  const char type = *(pTagData - 1);
453  if (!TagTypeHelper<T>::CanConvertFrom(type)) {
454  // TODO: set error string ?
455  return false;
456  }
457 
458  // determine data length
459  int destinationLength = 0;
460  switch (type) {
461 
462  // 1 byte data
466  destinationLength = 1;
467  break;
468 
469  // 2 byte data
472  destinationLength = 2;
473  break;
474 
475  // 4 byte data
479  destinationLength = 4;
480  break;
481 
482  // var-length types not supported for numeric destination
486  SetErrorString("BamAlignment::GetTag",
487  "cannot store variable length tag data into a numeric destination");
488  return false;
489 
490  // unrecognized tag type
491  default:
492  const std::string message = std::string("invalid tag type: ") + type;
493  SetErrorString("BamAlignment::GetTag", message);
494  return false;
495  }
496 
497  // store data in destination
498  destination = 0;
499  memcpy(&destination, pTagData, destinationLength);
500 
501  // return success
502  return true;
503 }
504 
505 template <>
506 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
507  std::string& destination) const
508 {
509  // skip if alignment is core-only
510  if (SupportData.HasCoreOnly) {
511  // TODO: set error string?
512  return false;
513  }
514 
515  // skip if no tags present
516  if (TagData.empty()) {
517  // TODO: set error string?
518  return false;
519  }
520 
521  // localize the tag data
522  char* pTagData = (char*)TagData.data();
523  const unsigned int tagDataLength = TagData.size();
524  unsigned int numBytesParsed = 0;
525 
526  // return failure if tag not found
527  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
528  // TODO: set error string?
529  return false;
530  }
531 
532  // otherwise copy data into destination
533  const unsigned int dataLength = strlen(pTagData);
534  destination.clear();
535  destination.resize(dataLength);
536  memcpy((char*)destination.data(), pTagData, dataLength);
537 
538  // return success
539  return true;
540 }
541 
549 template <typename T>
550 inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const
551 {
552 
553  // skip if alignment is core-only
554  if (SupportData.HasCoreOnly) {
555  // TODO: set error string?
556  return false;
557  }
558 
559  // skip if no tags present
560  if (TagData.empty()) {
561  // TODO: set error string?
562  return false;
563  }
564 
565  // localize the tag data
566  char* pTagData = (char*)TagData.data();
567  const unsigned int tagDataLength = TagData.size();
568  unsigned int numBytesParsed = 0;
569 
570  // return false if tag not found
571  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
572  // TODO: set error string?
573  return false;
574  }
575 
576  // check that tag is array type
577  const char tagType = *(pTagData - 1);
578  if (tagType != Constants::BAM_TAG_TYPE_ARRAY) {
579  SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
580  return false;
581  }
582 
583  // fetch element type
584  const char elementType = *pTagData;
585  if (!TagTypeHelper<T>::CanConvertFrom(elementType)) {
586  // TODO: set error string ?
587  return false;
588  }
589  ++pTagData;
590 
591  // calculate length of each element in tag's array
592  switch (elementType) {
596  break;
597 
600  break;
601 
605  break;
606 
607  // var-length types not supported for numeric destination
611  SetErrorString("BamAlignment::GetTag",
612  "invalid array data, variable-length elements are not allowed");
613  return false;
614 
615  // unknown tag type
616  default:
617  const std::string message = std::string("invalid array element type: ") + elementType;
618  SetErrorString("BamAlignment::GetTag", message);
619  return false;
620  }
621 
622  // get number of elements
623  int32_t numElements;
624  memcpy(&numElements, pTagData, sizeof(int32_t));
625  pTagData += 4;
626  destination.clear();
627  destination.reserve(numElements);
628 
629  // read in elements
630  T value;
631  for (int i = 0; i < numElements; ++i) {
632  memcpy(&value, pTagData, sizeof(T));
633  pTagData += sizeof(T);
634  destination.push_back(value);
635  }
636 
637  // return success
638  return true;
639 }
640 
// Convenience name for a vector of BamAlignment objects.
typedef std::vector<BamAlignment> BamAlignmentVector;
642 
643 } // namespace BamTools
644 
645 #endif // BAMALIGNMENT_H
BamTools::Constants::BAM_TAG_TYPE_UINT8
const char BAM_TAG_TYPE_UINT8
Definition: BamConstants.h:75
BamTools::BamAlignment::Filename
std::string Filename
name of BAM file which this alignment comes from
Definition: BamAlignment.h:142
BamTools::BamAlignment::InsertSize
int32_t InsertSize
mate-pair insert size
Definition: BamAlignment.h:141
BamTools::BamAlignment::CigarData
std::vector< CigarOp > CigarData
CIGAR operations for this alignment.
Definition: BamAlignment.h:138
BamTools::BamAlignment::HasTag
bool HasTag(const std::string &tag) const
Returns true if alignment has a record for requested tag.
Definition: BamAlignment.cpp:722
BamTools::BamAlignment::TagData
std::string TagData
tag data (use the provided methods to query/modify)
Definition: BamAlignment.h:132
BamAux.h
BamTools::Constants::BAM_TAG_ARRAYBASE_SIZE
const uint8_t BAM_TAG_ARRAYBASE_SIZE
Definition: BamConstants.h:87
BamTools::Constants::BAM_TAG_TYPE_UINT16
const char BAM_TAG_TYPE_UINT16
Definition: BamConstants.h:77
BamTools::Constants::BAM_TAG_TYPE_INT32
const char BAM_TAG_TYPE_INT32
Definition: BamConstants.h:78
BamTools::BamAlignment::Bin
uint16_t Bin
BAM (standard) index bin number for this alignment.
Definition: BamAlignment.h:135
BamTools::BamAlignment::Qualities
std::string Qualities
FASTQ qualities (ASCII characters, not numeric values)
Definition: BamAlignment.h:131
BamTools::BamAlignment::AddTag
bool AddTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:199
BamTools::Constants::BAM_TAG_TYPE_INT8
const char BAM_TAG_TYPE_INT8
Definition: BamConstants.h:74
BamTools::BamAlignment::Name
std::string Name
read name
Definition: BamAlignment.h:126
BamTools::BamAlignment::RemoveTag
void RemoveTag(const std::string &tag)
Removes field from BAM tags.
Definition: BamAlignment.cpp:845
BamTools::Constants::BAM_TAG_TYPE_ASCII
const char BAM_TAG_TYPE_ASCII
Definition: BamConstants.h:73
BamTools::BamAlignment::AlignedBases
std::string AlignedBases
'aligned' sequence (includes any indels, padding, clipping)
Definition: BamAlignment.h:130
BamTools::BamAlignmentVector
std::vector< BamAlignment > BamAlignmentVector
Definition: BamAlignment.h:641
BamTools::Constants::BAM_TAG_TYPE_HEX
const char BAM_TAG_TYPE_HEX
Definition: BamConstants.h:82
BamTools::BamAlignment
The main BAM alignment data structure.
Definition: BamAlignment.h:33
BamConstants.h
BamTools::Constants::BAM_TAG_TYPE_ARRAY
const char BAM_TAG_TYPE_ARRAY
Definition: BamConstants.h:83
BamTools::BamAlignment::Length
int32_t Length
length of query sequence
Definition: BamAlignment.h:127
BamTools::BamAlignment::GetTag
bool GetTag(const std::string &tag, T &destination) const
Definition: BamAlignment.h:425
BamTools
Contains all BamTools classes & methods.
Definition: Sort.h:24
API_EXPORT
#define API_EXPORT
Definition: api_global.h:18
BamTools::BamAlignment::QueryBases
std::string QueryBases
'original' sequence (as reported from sequencing machine)
Definition: BamAlignment.h:128
BamTools::BamAlignment::Position
int32_t Position
position (0-based) where alignment starts
Definition: BamAlignment.h:134
BamTools::BamAlignment::MateRefID
int32_t MateRefID
ID number for reference sequence where alignment's mate was aligned.
Definition: BamAlignment.h:139
BamTools::BamAlignment::RefID
int32_t RefID
ID number for reference sequence.
Definition: BamAlignment.h:133
BamTools::Constants::BAM_TAG_TYPE_STRING
const char BAM_TAG_TYPE_STRING
Definition: BamConstants.h:81
BamTools::Constants::BAM_TAG_TAGSIZE
const uint8_t BAM_TAG_TAGSIZE
Definition: BamConstants.h:85
api_global.h
BamTools::Constants::BAM_TAG_TYPE_INT16
const char BAM_TAG_TYPE_INT16
Definition: BamConstants.h:76
BamTools::BamAlignment::EditTag
bool EditTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:383
BamTools::Constants::BAM_TAG_TYPE_UINT32
const char BAM_TAG_TYPE_UINT32
Definition: BamConstants.h:79
BamTools::BamAlignment::MapQuality
uint16_t MapQuality
mapping quality score
Definition: BamAlignment.h:136
BamTools::BamAlignment::BuildCharData
bool BuildCharData()
Populates alignment string fields (read name, bases, qualities, tag data).
Definition: BamAlignment.cpp:135
BamTools::BamAlignment::AlignmentFlag
uint32_t AlignmentFlag
alignment bit-flag (use the provided methods to query/modify)
Definition: BamAlignment.h:137
BamTools::BamAlignment::MatePosition
int32_t MatePosition
position (0-based) where alignment's mate starts
Definition: BamAlignment.h:140
BamTools::Constants::BAM_TAG_TYPE_FLOAT
const char BAM_TAG_TYPE_FLOAT
Definition: BamConstants.h:80