CompactTree
compact_tree.h
1 
5 #ifndef COMPACT_TREE_H
6 #define COMPACT_TREE_H
7 
8 // include statements
9 #include <cstdint> // std::uint32_t, std::uint64_t
10 #include <cstdlib> // std::abs, std::atof
11 #include <cstring> // strcmp()
12 #include <fcntl.h> // O_RDONLY, open(), posix_fadvise()
13 #include <iostream> // std::ostream
14 #include <utility> // std::make_pair, std::pair
15 #include <queue> // std::queue
16 #include <sstream> // std::ostringstream
17 #include <stack> // std::stack
18 #include <stdexcept> // std::invalid_argument
19 #include <string> // std::string
20 #include <tuple> // std::tuple
21 #include <unistd.h> // read()
22 #include <unordered_map> // std::unordered_map
23 #include <unordered_set> // std::unordered_set
24 #include <utility> // std::pair
25 #include <vector> // std::vector
26 
27 // define node type, which is a fixed-width unsigned integer (default is 32-bit)
28 #if defined CT_NODE_64
29 #define CT_NODE_T std::uint64_t
30 #else
31 #define CT_NODE_T std::uint32_t
32 #endif
33 
34 // define edge length type, which is a floating point number (default is float)
35 #if defined CT_LENGTH_DOUBLE
36 #define CT_LENGTH_T double
37 #define PARSE_LENGTH_FROM_C_STR std::atof
38 #define PARSE_LENGTH_FROM_STRING std::stod
39 #else
40 #define CT_LENGTH_T float
41 #define PARSE_LENGTH_FROM_C_STR (float)std::atof
42 #define PARSE_LENGTH_FROM_STRING std::stof
43 #endif
44 
45 // general constants
46 #define COMPACTTREE_VERSION "0.0.7"
47 #define IO_BUFFER_SIZE 16384
48 #define STR_BUFFER_SIZE 16384
49 const std::string EMPTY_STRING = "";
50 const CT_NODE_T ROOT_NODE = (CT_NODE_T)0;
51 const CT_NODE_T NULL_NODE = (CT_NODE_T)(-1);
52 const CT_LENGTH_T ZERO_LENGTH = (CT_LENGTH_T)0.;
53 const CT_LENGTH_T ZERO_THRESH = (CT_LENGTH_T)0.000001;
54 
55 // error messages
56 const std::string ERROR_OPENING_FILE = "Error opening file";
57 const std::string ERROR_INVALID_NEWICK_FILE = "Invalid Newick file";
58 const std::string ERROR_INVALID_NEWICK_STRING = "Invalid Newick string";
59 
60 // compact_tree class
61 class compact_tree {
62  private:
66  std::vector<CT_NODE_T> parent; // `parent[i]` is the parent of node `i`
67  std::vector<std::vector<CT_NODE_T>> children; // `children[i]` is a `vector` containing the children of node `i`
68  std::vector<std::string> label; // `label[i]` is the label of node `i`
69  std::vector<CT_LENGTH_T> length; // `length[i]` is the length of the edge incident to (i.e., going into) node `i`
70 
74  size_t num_leaves = 0; // cache the total number of leaves (avoid recalculation)
75  CT_NODE_T tmp_node; // temporary holding variable for nodes (e.g. new child); value should only be used immediately after assigning
76 
80  compact_tree() {}
81 
89  CT_NODE_T create_child(const CT_NODE_T parent_node, bool store_labels, bool store_lengths);
90 
94  void calc_num_leaves() { CT_NODE_T num_nodes = parent.size(); for(CT_NODE_T curr = 0; curr < num_nodes; ++curr) { if(is_leaf(curr)) { num_leaves += 1; } } }
95 
96  public:
105  compact_tree(const char* const input, bool is_fn = true, bool store_labels = true, bool store_lengths = true, size_t reserve = 0);
106 
115  compact_tree(const std::string & input, bool is_fn = true, bool store_labels = true, bool store_lengths = true, size_t reserve = 0) : compact_tree(&input[0], is_fn, store_labels, store_lengths, reserve) {}
116 
121  compact_tree(const compact_tree & o) : parent(o.parent), children(o.children), label(o.label), length(o.length), num_leaves(o.num_leaves) {}
122 
130  CT_NODE_T add_child(const CT_NODE_T parent_node, const std::string & new_label = EMPTY_STRING, CT_LENGTH_T new_length = ZERO_LENGTH) { if((num_leaves != 0) && !is_leaf(parent_node)) { ++num_leaves; } CT_NODE_T x = create_child(parent_node,has_labels(),has_edge_lengths()); if(!(new_label.empty())) { set_label(x,new_label); } if(std::abs(new_length) > ZERO_THRESH) { set_edge_length(x,new_length); } return x; }
131 
138  void print_newick(std::ostream & out, CT_NODE_T node = ROOT_NODE, bool include_semicolon = true);
139 
145  std::string get_newick(CT_NODE_T node = ROOT_NODE, bool include_semicolon = true) { std::ostringstream oss; print_newick(oss,node,include_semicolon); return oss.str(); }
146 
151  size_t get_num_nodes() const { return parent.size(); }
152 
157  size_t get_num_leaves() { if(num_leaves == 0) { calc_num_leaves(); } return num_leaves; }
158 
163  size_t get_num_internal() { return get_num_nodes() - get_num_leaves(); }
164 
169  CT_NODE_T get_root() const { return ROOT_NODE; }
170 
176  bool is_root(CT_NODE_T node) const { return node == ROOT_NODE; }
177 
183  bool is_leaf(CT_NODE_T node) const { return children[node].size() == 0; }
184 
190  CT_NODE_T get_parent(CT_NODE_T node) const { return parent[node]; }
191 
197  const std::vector<CT_NODE_T> & get_children(CT_NODE_T node) const { return children[node]; }
198 
203  bool has_edge_lengths() const { return length.size() != 0; }
204 
209  void clear_edge_lengths(bool shrink = true) { if(!length.empty()) { length.clear(); if(shrink) { length.shrink_to_fit(); } } }
210 
216  CT_LENGTH_T get_edge_length(CT_NODE_T node) const { return has_edge_lengths() ? length[node] : (CT_LENGTH_T)0.; }
217 
221  const std::vector<CT_LENGTH_T> & get_edge_lengths() const { return length; }
222 
228  void set_edge_length(CT_NODE_T node, CT_LENGTH_T new_length) {
229  if(!has_edge_lengths()) { length = std::vector<CT_LENGTH_T>(get_num_nodes(), (CT_LENGTH_T)0.); } length[node] = new_length;
230  }
231 
236  bool has_labels() const { return label.size() != 0; }
237 
242  void clear_labels(bool shrink = true) { if(!label.empty()) { label.clear(); if(shrink) { label.shrink_to_fit(); } } }
243 
249  const std::string & get_label(CT_NODE_T node) const { return has_labels() ? label[node] : EMPTY_STRING; }
250 
256  void set_label(CT_NODE_T node, const std::string & new_label) {
257  if(!has_labels()) { label = std::vector<std::string>(get_num_nodes(), ""); } label[node] = new_label;
258  }
259 
264  const std::vector<std::string> & get_labels() const { return label; }
265 
271  void replace_labels(const std::unordered_map<std::string, std::string> & label_map, bool include_internal = true) {
272  const size_t NUM_NODES = get_num_nodes(); CT_NODE_T curr; std::unordered_map<std::string,std::string>::const_iterator it; std::unordered_map<std::string,std::string>::const_iterator it_end = label_map.cend();
273  for(curr = ROOT_NODE; curr < NUM_NODES; ++curr) {
274  if(is_leaf(curr) || include_internal) {
275  it = label_map.find(label[curr]);
276  if(it != it_end) { label[curr] = it->second; }
277  }
278  }
279  }
280 
286  std::tuple<const std::string*, CT_LENGTH_T, CT_NODE_T, const std::vector<CT_NODE_T>*> operator[](CT_NODE_T i) const { return std::make_tuple(&get_label(i), get_edge_length(i), parent[i], &children[i]); }
287 
292  class preorder_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
293  private:
294  CT_NODE_T node;
295  public:
296  preorder_iterator(CT_NODE_T x) : node(x) {}
297  preorder_iterator(const preorder_iterator & it) : node(it.node) {}
298  preorder_iterator & operator=(const preorder_iterator & it) { node = it.node; return *this; }
299  preorder_iterator & operator++() { ++node; return *this; }
300  preorder_iterator operator++(int) { preorder_iterator tmp(*this); operator++(); return tmp; }
301  bool operator==(const preorder_iterator & rhs) const { return node == rhs.node; }
302  bool operator!=(const preorder_iterator & rhs) const { return node != rhs.node; }
303  CT_NODE_T operator*() { return node; }
304  };
305 
311 
317 
322  class postorder_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
323  private:
324  CT_NODE_T node;
325  public:
326  postorder_iterator(CT_NODE_T x) : node(x) {}
327  postorder_iterator(const postorder_iterator & it) : node(it.node) {}
328  postorder_iterator & operator=(const postorder_iterator & it) { node = it.node; return *this; }
329  postorder_iterator & operator++() { --node; return *this; }
330  postorder_iterator operator++(int) { postorder_iterator tmp(*this); operator++(); return tmp; }
331  bool operator==(const postorder_iterator & rhs) const { return node == rhs.node; }
332  bool operator!=(const postorder_iterator & rhs) const { return node != rhs.node; }
333  CT_NODE_T operator*() { return node; }
334  };
340 
346 
352  class levelorder_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
353  private:
354  std::queue<CT_NODE_T> q; compact_tree* tree_ptr;
355  public:
356  levelorder_iterator(compact_tree* const tp) : tree_ptr(tp) {}
357  levelorder_iterator(const CT_NODE_T root, compact_tree* const tp) : tree_ptr(tp) { q.push(root); }
358  levelorder_iterator(std::queue<CT_NODE_T> x, compact_tree* const tp) : q(x), tree_ptr(tp) {}
359  levelorder_iterator(const levelorder_iterator & o) : q(o.q), tree_ptr(o.tree_ptr) {}
360  levelorder_iterator & operator=(const levelorder_iterator & o) { q = o.q; tree_ptr = o.tree_ptr; return *this; }
361  levelorder_iterator & operator++() { CT_NODE_T x = q.front(); std::vector<CT_NODE_T>::iterator it_end = tree_ptr->children[x].end(); for(std::vector<CT_NODE_T>::iterator it = tree_ptr->children[x].begin(); it != it_end; ++it) { q.push(*it); } q.pop(); return *this; }
362  levelorder_iterator operator++(int) { levelorder_iterator tmp(*this); operator++(); return tmp; }
363  bool operator==(const levelorder_iterator & rhs) const { return q == rhs.q; }
364  bool operator!=(const levelorder_iterator & rhs) const { return q != rhs.q; }
365  CT_NODE_T operator*() { return q.front(); }
366  };
367 
373 
379 
384  class leaves_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
385  private:
386  CT_NODE_T node; const compact_tree* tree_ptr;
387  public:
388  leaves_iterator(CT_NODE_T x, const compact_tree* const tp) : node(x), tree_ptr(tp) {}
389  leaves_iterator(const leaves_iterator & it) : node(it.node), tree_ptr(it.tree_ptr) {}
390  leaves_iterator & operator=(const leaves_iterator & it) { node = it.node; tree_ptr = it.tree_ptr; return *this; }
391  leaves_iterator & operator++() { CT_NODE_T num_nodes = tree_ptr->get_num_nodes(); while(((++node) < num_nodes) && !(tree_ptr->is_leaf(node))) {} return *this; }
392  leaves_iterator operator++(int) { leaves_iterator tmp(*this); operator++(); return tmp; }
393  bool operator==(const leaves_iterator & rhs) const { return node == rhs.node; }
394  bool operator!=(const leaves_iterator & rhs) const { return node != rhs.node; }
395  CT_NODE_T operator*() { return node; }
396  };
397 
402  leaves_iterator leaves_begin() { CT_NODE_T node = ROOT_NODE; CT_NODE_T num_nodes = get_num_nodes(); while(((++node) < num_nodes) && (!is_leaf(node))) {} return leaves_iterator(node, this); }
403 
408  leaves_iterator leaves_end() { return leaves_iterator((CT_NODE_T)get_num_nodes(), this); }
409 
414  class children_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
415  private:
416  std::vector<CT_NODE_T>::iterator it;
417  public:
418  children_iterator(std::vector<CT_NODE_T>::iterator x) : it(x) {}
419  children_iterator(const children_iterator & o) : it(o.it) {}
420  children_iterator & operator=(const children_iterator & o) { it = o.it; return *this; }
421  children_iterator & operator++() { ++it; return *this; }
422  children_iterator operator++(int) { children_iterator tmp(*this); operator++(); return tmp; }
423  bool operator==(const children_iterator & rhs) const { return it == rhs.it; }
424  bool operator!=(const children_iterator & rhs) const { return it != rhs.it; }
425  CT_NODE_T operator*() { return *it; }
426  };
427 
433  children_iterator children_begin(CT_NODE_T node) { return children_iterator(children[node].begin()); }
434 
440  children_iterator children_end(CT_NODE_T node) { return children_iterator(children[node].end()); }
441 
447  CT_NODE_T find_mrca(const std::unordered_set<CT_NODE_T> & nodes) const;
448 
454  compact_tree extract_subtree(CT_NODE_T node);
455 
462  double calc_total_bl(bool include_internal = true, bool include_leaves = true) const {
463  if(!(include_internal || include_leaves)) { return 0.; }
464  double tot = 0; CT_NODE_T num_nodes = get_num_nodes();
465  for(CT_NODE_T node = 0; node < num_nodes; ++node) {
466  if(is_leaf(node)) { if(include_leaves) { tot += get_edge_length(node); } }
467  else { if(include_internal) { tot += get_edge_length(node); } }
468  }
469  return tot;
470  }
471 
478  double calc_avg_bl(bool include_internal = true, bool include_leaves = true) const {
479  if(!(include_internal || include_leaves)) { return 0.; }
480  double tot = 0; CT_NODE_T num_nodes = get_num_nodes(); CT_NODE_T count = 0;
481  for(CT_NODE_T node = 0; node < num_nodes; ++node) {
482  if(is_leaf(node)) { if(include_leaves) { tot += get_edge_length(node); ++count; } }
483  else { if(include_internal) { tot += get_edge_length(node); ++count; } }
484  }
485  return tot / count;
486  }
487 
494  double calc_dist(CT_NODE_T u, CT_NODE_T v) const {
495  if(u == v) { return 0.; } if(u == parent[v]) { return get_edge_length(v); } if(v == parent[u]) { return get_edge_length(u); }
496  std::unordered_map<CT_NODE_T, double> u_dists; std::unordered_map<CT_NODE_T, double> v_dists; CT_NODE_T c = u; CT_NODE_T p = parent[c]; std::unordered_map<CT_NODE_T, double>::iterator it;
497  while(p != NULL_NODE) { u_dists[p] = u_dists[c] + get_edge_length(c); c = p; p = parent[p]; } c = v; p = parent[c];
498  while(p != NULL_NODE) { v_dists[p] = v_dists[c] + get_edge_length(c); it = u_dists.find(p); if(it != u_dists.end()) { return it->second + v_dists[p]; } c = p; p = parent[p]; }
499  return -1.; // error, shouldn't reach here
500  }
501 
506  std::vector<std::pair<std::pair<CT_NODE_T,CT_NODE_T>, double>> calc_distance_matrix();
507 };
508 
509 // print Newick string (currently recursive)
510 void compact_tree::print_newick(std::ostream & out, CT_NODE_T node, bool include_semicolon) {
511  std::vector<CT_NODE_T>::iterator it_begin = children[node].begin(); std::vector<CT_NODE_T>::iterator it_end = children[node].end();
512  for(std::vector<CT_NODE_T>::iterator it = it_begin; it != it_end; ++it) { out << ((it == it_begin) ? '(' : ','); print_newick(out, *it, false); }
513  if(!is_leaf(node)) { out << ')'; }
514  if(label.size() != 0) { out << label[node]; }
515  if(length.size() != 0) { out << ':' << length[node]; }
516  if(include_semicolon) { out << ';'; }
517 }
518 
519 // find the MRCA of nodes
520 CT_NODE_T compact_tree::find_mrca(const std::unordered_set<CT_NODE_T> & nodes) const {
521  std::queue<CT_NODE_T, std::deque<CT_NODE_T>> to_visit(std::deque<CT_NODE_T>(nodes.begin(), nodes.end()));
522  std::unordered_map<CT_NODE_T, CT_NODE_T> count; std::unordered_map<CT_NODE_T, CT_NODE_T>::iterator count_it;
523  CT_NODE_T curr_node; CT_NODE_T curr_parent; size_t total = nodes.size();
524  while(!to_visit.empty()) {
525  curr_node = to_visit.front(); to_visit.pop(); count_it = count.find(curr_node); curr_parent = parent[curr_node];
526  if(count_it == count.end()) { // current node hasn't been seen before, initialize its count to 0
527  count_it = count.emplace(curr_node, 0).first;
528  }
529  if((++(count_it->second)) == total) {
530  return curr_node;
531  }
532  if(curr_parent != NULL_NODE) {
533  to_visit.emplace(curr_parent);
534  }
535  }
536  return NULL_NODE; // shouldn't ever reach here
537 }
538 
539 // calculate distance matrix
540 std::vector<std::pair<std::pair<CT_NODE_T,CT_NODE_T>, double>> compact_tree::calc_distance_matrix() {
541  // set things up
542  const size_t NUM_NODES = get_num_nodes(); const size_t N = get_num_leaves(); const size_t N_MINUS_1 = N - 1;
543  const size_t N_CHOOSE_2 = ((N%2) == 0) ? ((N/2) * N_MINUS_1) : ((N_MINUS_1/2) * N);
544  std::vector<std::pair<std::pair<CT_NODE_T,CT_NODE_T>, double>> dm; dm.reserve(N_CHOOSE_2);
545  std::unordered_map<CT_NODE_T, std::unordered_map<CT_NODE_T, double>*> leaf_dists;
546  std::vector<CT_NODE_T>::iterator ch_it; std::vector<CT_NODE_T>::iterator ch_it_2; std::vector<CT_NODE_T>::iterator ch_it_end;
547  CT_NODE_T curr; CT_NODE_T child; CT_NODE_T child_2; std::unordered_map<CT_NODE_T, double>* curr_leaf_dists;
548  std::unordered_map<CT_NODE_T, double>* child_leaf_dists; std::unordered_map<CT_NODE_T, double>::iterator dist_it; std::unordered_map<CT_NODE_T, double>::iterator dist_it_end;
549  std::unordered_map<CT_NODE_T, double>* child_leaf_dists_2; std::unordered_map<CT_NODE_T, double>::iterator dist_it_2; std::unordered_map<CT_NODE_T, double>::iterator dist_it_2_end;
550 
551  // calculate pairwise distances
552  for(curr = NUM_NODES-1; curr != NULL_NODE; --curr) {
553  curr_leaf_dists = new std::unordered_map<CT_NODE_T, double>();
554  leaf_dists.emplace(curr, curr_leaf_dists);
555 
556  // for leaves, they have 0 distance to themselves
557  if(is_leaf(curr)) {
558  curr_leaf_dists->emplace(curr, (double)0.);
559  }
560 
561  // for internal nodes:
562  else {
563  // calculate all pairwise distances between leaves below this node
564  for(ch_it = children[curr].begin(), ch_it_end = children[curr].end(); std::next(ch_it) != ch_it_end; ++ch_it) {
565  child = *ch_it; child_leaf_dists = leaf_dists[child];
566  for(ch_it_2 = std::next(ch_it); ch_it_2 != ch_it_end; ++ch_it_2) {
567  child_2 = *ch_it_2; child_leaf_dists_2 = leaf_dists[child_2];
568  for(dist_it = child_leaf_dists->begin(), dist_it_end = child_leaf_dists->end(); dist_it != dist_it_end; ++dist_it) {
569  for(dist_it_2 = child_leaf_dists_2->begin(), dist_it_2_end = child_leaf_dists_2->end(); dist_it_2 != dist_it_2_end; ++dist_it_2) {
570  dm.emplace_back(std::make_pair(std::make_pair(dist_it->first,dist_it_2->first), dist_it->second + dist_it_2->second + get_edge_length(child) + get_edge_length(child_2)));
571  }
572  }
573  }
574  }
575 
576  // calculate leaf distances to this node
577  for(ch_it = children[curr].begin(), ch_it_end = children[curr].end(); ch_it != ch_it_end; ++ch_it) {
578  child = *ch_it; child_leaf_dists = leaf_dists[child];
579  for(dist_it = child_leaf_dists->begin(), dist_it_end = child_leaf_dists->end(); dist_it != dist_it_end; ++dist_it) {
580  curr_leaf_dists->emplace(dist_it->first, dist_it->second + length[child]);
581  }
582  delete child_leaf_dists;
583  }
584  }
585  }
586  delete leaf_dists[ROOT_NODE]; return dm;
587 }
588 
589 // helper function to create new node and add as child to parent
590 CT_NODE_T compact_tree::create_child(const CT_NODE_T parent_node, bool store_labels, bool store_lengths) {
591  tmp_node = parent.size(); parent.emplace_back(parent_node); children.emplace_back(std::vector<CT_NODE_T>());
592  if(parent_node != NULL_NODE) { children[parent_node].emplace_back(tmp_node); }
593  if(store_lengths) { length.emplace_back((CT_LENGTH_T)0); }
594  if(store_labels) { label.emplace_back(""); }
595  return tmp_node;
596 }
597 
598 // extract a subtree
600  compact_tree new_tree; std::stack<std::pair<CT_NODE_T, CT_NODE_T>> to_copy; to_copy.push(std::make_pair(node,0)); // first = old tree; second = new tree
601  bool has_lengths = (length.size() != 0); bool has_labels = (label.size() != 0); new_tree.create_child(NULL_NODE, has_labels, has_lengths);
602  if(has_lengths) { new_tree.length.emplace_back(ZERO_LENGTH); }
603  if(has_labels) { new_tree.label.emplace_back(EMPTY_STRING); }
604  std::pair<CT_NODE_T, CT_NODE_T> curr;
605  while(!to_copy.empty()) {
606  curr = to_copy.top(); to_copy.pop(); std::vector<CT_NODE_T>::iterator it_end = children[curr.first].end();
607  if(has_lengths) { new_tree.length[curr.second] = length[curr.first]; }
608  if(has_labels) { new_tree.label[curr.second] = label[curr.first]; }
609  for(std::vector<CT_NODE_T>::iterator it = children[curr.first].begin(); it != it_end; ++it) {
610  to_copy.push(std::make_pair(*it, new_tree.create_child(curr.second, has_labels, has_lengths)));
611  }
612  }
613  return new_tree;
614 }
615 
616 // compact_tree constructor (putting it last because it's super long)
617 compact_tree::compact_tree(const char* const input, bool is_fn, bool store_labels, bool store_lengths, size_t reserve) {
618  // reserve space up-front (if given `reserve`) to reduce resizing (save time)
619  if(reserve != 0) { parent.reserve(reserve); if(store_lengths) { length.reserve(reserve); } if(store_labels) { label.reserve(reserve); } }
620 
621  // set up file input: https://stackoverflow.com/a/17925143/2134991
622  int fd = -1;
623  size_t bytes_read = 0; size_t i; // variables to help with reading
624  const size_t INPUT_LEN = strlen(input); size_t input_i = 0; // variables to help with reading from Newick string specifically
625  char buf[IO_BUFFER_SIZE + 1]; // buffer for reading
626  char str_buf[STR_BUFFER_SIZE] = {}; size_t str_buf_i = 0; // helper string buffer
627  if(is_fn) {
628  fd = open(input, O_RDONLY);
629  if(fd == -1) {
630  throw std::invalid_argument(ERROR_OPENING_FILE + std::string(": ") + std::string(input));
631  }
632  posix_fadvise(fd, 0, 0, 1); // FDADVICE_SEQUENTIAL
633  }
634 
635  // set up initial Newick parsing state
636  create_child(NULL_NODE, store_labels, store_lengths); // create empty root node
637  CT_NODE_T curr_node = ROOT_NODE; // start traversal at root
638  bool parse_length = false; // parsing a length right now?
639  bool parse_label = false; // parsing a label right now?
640  bool parse_label_single = false; // parsing a single-quote label right now?
641  bool parse_label_double = false; // parsing a double-quote label right now?
642  bool parse_comment = false; // parsing a comment [...] right now?
643 
644  // read Newick tree byte-by-byte
645  while(true) {
646  // either read from file or from Newick C string
647  if(is_fn) {
648  bytes_read = read(fd, buf, IO_BUFFER_SIZE);
649  } else {
650  if(input_i == INPUT_LEN) { // finished parsing string
651  bytes_read = 0;
652  } else {
653  if((input_i + IO_BUFFER_SIZE) <= INPUT_LEN) {
654  bytes_read = IO_BUFFER_SIZE;
655  } else { // fewer than IO_BUFFER_SIZE bytes left in Newick string (input)
656  bytes_read = INPUT_LEN - input_i;
657  }
658  memcpy(buf, input+input_i, bytes_read); input_i += bytes_read;
659  }
660  }
661 
662  // handle done reading (!bytes_read) and read failed (bytes_read == -1)
663  if(!bytes_read || bytes_read == (size_t)(-1)) {
664  throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(": ") + std::string(input));
665  }
666 
667  // iterate over the characters in the buffer
668  for(i = 0; i < bytes_read; ++i) {
669  // currently parsing a comment (ignore for now)
670  if(parse_comment) {
671  // reached end of comment
672  if(buf[i] == ']') {
673  parse_comment = false;
674  }
675  }
676 
677  // currently parsing an edge length
678  else if(parse_length) {
679  switch(buf[i]) {
680  // finished parsing edge length
681  case ',':
682  case ')':
683  case ';':
684  if(store_lengths) { str_buf[str_buf_i] = (char)0; length[curr_node] = PARSE_LENGTH_FROM_C_STR(str_buf); }
685  parse_length = false; --i; break; // need to re-read this character
686 
687  // edge comment (ignore for now)
688  case '[':
689  parse_comment = true; break;
690 
691  // character within edge length
692  default:
693  if(store_lengths) { str_buf[str_buf_i++] = buf[i]; }
694  break;
695  }
696  }
697 
698  // currently parsing a node label
699  else if(parse_label) {
700  // parsing quoted label ('' or ""), so blindly add next char to label
701  if(parse_label_single || parse_label_double) {
702  if(store_labels) { str_buf[str_buf_i++] = buf[i]; }
703  if(buf[i] == '\'' && parse_label_single) {
704  parse_label = false; parse_label_single = false;
705  } else if(buf[i] == '"' && parse_label_double) {
706  parse_label = false; parse_label_double = false;
707  }
708  }
709 
710  // parsing non-quoted label, so check next char for validity before adding
711  else {
712  switch(buf[i]) {
713  case ':': case ',': case ')': case ';':
714  parse_label = false; --i; break; // finished label, so need to re-read this character
715  default:
716  if(store_labels) { str_buf[str_buf_i++] = buf[i]; } break;
717  }
718  }
719 
720  // if we finished parsing the label, finalize it
721  if(!parse_label) {
722  if(store_labels) { str_buf[str_buf_i] = (char)0; label[curr_node] = str_buf; }
723  parse_label = false;
724  }
725  }
726 
727  // all other symbols
728  else {
729  switch(buf[i]) {
730  // ignore spaces outside of labels
731  case ' ':
732  break;
733 
734  // end of Newick string
735  case ';':
736  if(curr_node != (CT_NODE_T)0) {
737  throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(": ") + std::string(input));
738  }
739  return;
740 
741  // go to new child
742  case '(':
743  if(curr_node == NULL_NODE) {
744  throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(": ") + std::string(input));
745  }
746  curr_node = create_child(curr_node, store_labels, store_lengths); break;
747 
748  // go to parent
749  case ')':
750  curr_node = parent[curr_node]; break;
751 
752  // go to new sibling
753  case ',':
754  if((curr_node == NULL_NODE) || (parent[curr_node] == NULL_NODE)) {
755  throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(": ") + std::string(input));
756  }
757  curr_node = create_child(parent[curr_node], store_labels, store_lengths); break;
758 
759  // node comment (ignore for now)
760  case '[':
761  parse_comment = true; break;
762 
763  // edge length
764  case ':':
765  if(store_lengths) { str_buf_i = 0; }
766  parse_length = true; break;
767 
768  // about to parse a node label in single quotes ('')
769  case '\'':
770  if(store_labels) { str_buf_i = 0; }
771  parse_label = true; parse_label_single = true; break;
772 
773  // about to parse a node label in double quotes ("")
774  case '"':
775  if(store_labels) { str_buf_i = 0; }
776  parse_label = true; parse_label_double = true; break;
777 
778  // about to start a node label without quotes
779  default:
780  if(store_labels) { str_buf_i = 0; }
781  parse_label = true; --i; break; // need to re-read this character (it's part of the label)
782  }
783  }
784  }
785  }
786 }
787 #endif
Definition: compact_tree.h:414
Definition: compact_tree.h:384
Definition: compact_tree.h:352
Definition: compact_tree.h:322
Definition: compact_tree.h:292
Definition: compact_tree.h:61
const std::vector< std::string > & get_labels() const
Definition: compact_tree.h:264
CT_NODE_T get_root() const
Definition: compact_tree.h:169
bool has_edge_lengths() const
Definition: compact_tree.h:203
const std::string & get_label(CT_NODE_T node) const
Definition: compact_tree.h:249
CT_NODE_T find_mrca(const std::unordered_set< CT_NODE_T > &nodes) const
Definition: compact_tree.h:520
double calc_dist(CT_NODE_T u, CT_NODE_T v) const
Definition: compact_tree.h:494
postorder_iterator postorder_begin()
Definition: compact_tree.h:339
bool is_leaf(CT_NODE_T node) const
Definition: compact_tree.h:183
void clear_labels(bool shrink=true)
Definition: compact_tree.h:242
const std::vector< CT_LENGTH_T > & get_edge_lengths() const
Definition: compact_tree.h:221
leaves_iterator leaves_end()
Definition: compact_tree.h:408
void set_edge_length(CT_NODE_T node, CT_LENGTH_T new_length)
Definition: compact_tree.h:228
children_iterator children_end(CT_NODE_T node)
Definition: compact_tree.h:440
size_t get_num_leaves()
Definition: compact_tree.h:157
size_t get_num_nodes() const
Definition: compact_tree.h:151
compact_tree(const std::string &input, bool is_fn=true, bool store_labels=true, bool store_lengths=true, size_t reserve=0)
Definition: compact_tree.h:115
std::string get_newick(CT_NODE_T node=ROOT_NODE, bool include_semicolon=true)
Definition: compact_tree.h:145
CT_NODE_T get_parent(CT_NODE_T node) const
Definition: compact_tree.h:190
double calc_total_bl(bool include_internal=true, bool include_leaves=true) const
Definition: compact_tree.h:462
levelorder_iterator levelorder_begin()
Definition: compact_tree.h:372
preorder_iterator preorder_end()
Definition: compact_tree.h:316
bool is_root(CT_NODE_T node) const
Definition: compact_tree.h:176
CT_LENGTH_T get_edge_length(CT_NODE_T node) const
Definition: compact_tree.h:216
preorder_iterator preorder_begin()
Definition: compact_tree.h:310
void replace_labels(const std::unordered_map< std::string, std::string > &label_map, bool include_internal=true)
Definition: compact_tree.h:271
double calc_avg_bl(bool include_internal=true, bool include_leaves=true) const
Definition: compact_tree.h:478
levelorder_iterator levelorder_end()
Definition: compact_tree.h:378
void set_label(CT_NODE_T node, const std::string &new_label)
Definition: compact_tree.h:256
void print_newick(std::ostream &out, CT_NODE_T node=ROOT_NODE, bool include_semicolon=true)
Definition: compact_tree.h:510
bool has_labels() const
Definition: compact_tree.h:236
postorder_iterator postorder_end()
Definition: compact_tree.h:345
std::tuple< const std::string *, CT_LENGTH_T, CT_NODE_T, const std::vector< CT_NODE_T > * > operator[](CT_NODE_T i) const
Definition: compact_tree.h:286
compact_tree(const compact_tree &o)
Definition: compact_tree.h:121
children_iterator children_begin(CT_NODE_T node)
Definition: compact_tree.h:433
size_t get_num_internal()
Definition: compact_tree.h:163
const std::vector< CT_NODE_T > & get_children(CT_NODE_T node) const
Definition: compact_tree.h:197
std::vector< std::pair< std::pair< CT_NODE_T, CT_NODE_T >, double > > calc_distance_matrix()
Definition: compact_tree.h:540
leaves_iterator leaves_begin()
Definition: compact_tree.h:402
CT_NODE_T add_child(const CT_NODE_T parent_node, const std::string &new_label=EMPTY_STRING, CT_LENGTH_T new_length=ZERO_LENGTH)
Definition: compact_tree.h:130
compact_tree extract_subtree(CT_NODE_T node)
Definition: compact_tree.h:599
void clear_edge_lengths(bool shrink=true)
Definition: compact_tree.h:209