CompactTree 1.0.0
Loading...
Searching...
No Matches
compact_tree.h
1
5#ifndef COMPACT_TREE_H
6#define COMPACT_TREE_H
7
8// include statements
9#include <cstdint> // std::uint32_t, std::uint64_t
10#include <cstdlib> // std::abs, std::atof
11#include <cstring> // strcmp()
12#include <fcntl.h> // O_RDONLY, open(), posix_fadvise()
13#include <iostream> // std::ostream
14#include <limits> // std::numeric_limits<T>::max
15#include <queue> // std::queue
16#include <sstream> // std::ostringstream
17#include <stack> // std::stack
18#include <stdexcept> // std::invalid_argument
19#include <string> // std::string
20#include <tuple> // std::tuple
21#include <unistd.h> // read()
22#include <unordered_map> // std::unordered_map
23#include <unordered_set> // std::unordered_set
24#include <utility> // std::make_pair, std::pair
25#include <vector> // std::vector
26
27// define node type, which is a fixed-width unsigned integer (default is 32-bit)
28#if defined CT_NODE_64
29#define CT_NODE_T std::uint64_t
30#else
31#define CT_NODE_T std::uint32_t
32#endif
33
34// define edge length type, which is a floating point number (default is float)
35#if defined CT_LENGTH_DOUBLE
36#define CT_LENGTH_T double
37#define PARSE_LENGTH_FROM_C_STR std::atof
38#define PARSE_LENGTH_FROM_STRING std::stod
39#else
40#define CT_LENGTH_T float
41#define PARSE_LENGTH_FROM_C_STR (float)std::atof
42#define PARSE_LENGTH_FROM_STRING std::stof
43#endif
44
45// general constants
46#define COMPACTTREE_VERSION "1.0.0"
47#define IO_BUFFER_SIZE 16384
48#define STR_BUFFER_SIZE 16384
49const std::string EMPTY_STRING = "";
50const CT_NODE_T ROOT_NODE = (CT_NODE_T)0;
51const CT_NODE_T NULL_NODE = std::numeric_limits<CT_NODE_T>::max();
52const CT_LENGTH_T ZERO_LENGTH = (CT_LENGTH_T)0.;
53const CT_LENGTH_T ZERO_THRESH = (CT_LENGTH_T)0.000001;
54
55// error messages
56const std::string ERROR_OPENING_FILE = "Error opening file";
57const std::string ERROR_INVALID_NEWICK_FILE = "Invalid Newick file";
58const std::string ERROR_INVALID_NEWICK_STRING = "Invalid Newick string";
59
60// compact_tree class
62 private:
66 std::vector<CT_NODE_T> parent; // `parent[i]` is the parent of node `i`
67 std::vector<std::vector<CT_NODE_T>> children; // `children[i]` is a `vector` containing the children of node `i`
68 std::vector<std::string> label; // `label[i]` is the label of node `i`
69 std::vector<CT_LENGTH_T> length; // `length[i]` is the length of the edge incident to (i.e., going into) node `i`
70
74 size_t num_leaves = 0; // cache the total number of leaves (avoid recalculation)
75 CT_NODE_T tmp_node; // temporary holding variable for nodes (e.g. new child); value should only be used immediately after assigning
76
80 compact_tree() {}
81
89 CT_NODE_T create_child(const CT_NODE_T parent_node, bool store_labels, bool store_lengths);
90
94 void calc_num_leaves() { CT_NODE_T num_nodes = parent.size(); for(CT_NODE_T curr = 0; curr < num_nodes; ++curr) { if(is_leaf(curr)) { num_leaves += 1; } } }
95
96 public:
105 compact_tree(const char* const input, bool is_fn = true, bool store_labels = true, bool store_lengths = true, size_t reserve = 0);
106
115 compact_tree(const std::string & input, bool is_fn = true, bool store_labels = true, bool store_lengths = true, size_t reserve = 0) : compact_tree(&input[0], is_fn, store_labels, store_lengths, reserve) {}
116
121 compact_tree(const compact_tree & o) : parent(o.parent), children(o.children), label(o.label), length(o.length), num_leaves(o.num_leaves) {}
122
130 CT_NODE_T add_child(const CT_NODE_T parent_node, const std::string & new_label = EMPTY_STRING, CT_LENGTH_T new_length = ZERO_LENGTH) { if((num_leaves != 0) && !is_leaf(parent_node)) { ++num_leaves; } CT_NODE_T x = create_child(parent_node,has_labels(),has_edge_lengths()); if(!(new_label.empty())) { set_label(x,new_label); } if(std::abs(new_length) > ZERO_THRESH) { set_edge_length(x,new_length); } return x; }
131
138 void print_newick(std::ostream & out, CT_NODE_T node = ROOT_NODE, bool include_semicolon = true);
139
145 std::string get_newick(CT_NODE_T node = ROOT_NODE, bool include_semicolon = true) { std::ostringstream oss; print_newick(oss,node,include_semicolon); return oss.str(); }
146
151 size_t get_num_nodes() const { return parent.size(); }
152
157 size_t get_num_leaves() { if(num_leaves == 0) { calc_num_leaves(); } return num_leaves; }
158
164
169 CT_NODE_T get_root() const { return ROOT_NODE; }
170
176 bool is_root(CT_NODE_T node) const { return node == ROOT_NODE; }
177
183 bool is_leaf(CT_NODE_T node) const { return children[node].size() == 0; }
184
190 CT_NODE_T get_parent(CT_NODE_T node) const { return parent[node]; }
191
197 const std::vector<CT_NODE_T> & get_children(CT_NODE_T node) const { return children[node]; }
198
203 bool has_edge_lengths() const { return length.size() != 0; }
204
209 void clear_edge_lengths(bool shrink = true) { if(!length.empty()) { length.clear(); if(shrink) { length.shrink_to_fit(); } } }
210
216 CT_LENGTH_T get_edge_length(CT_NODE_T node) const { return has_edge_lengths() ? length[node] : (CT_LENGTH_T)0.; }
217
221 const std::vector<CT_LENGTH_T> & get_edge_lengths() const { return length; }
222
228 void set_edge_length(CT_NODE_T node, CT_LENGTH_T new_length) {
229 if(!has_edge_lengths()) { length = std::vector<CT_LENGTH_T>(get_num_nodes(), (CT_LENGTH_T)0.); } length[node] = new_length;
230 }
231
236 bool has_labels() const { return label.size() != 0; }
237
242 void clear_labels(bool shrink = true) { if(!label.empty()) { label.clear(); if(shrink) { label.shrink_to_fit(); } } }
243
249 const std::string & get_label(CT_NODE_T node) const { return has_labels() ? label[node] : EMPTY_STRING; }
250
256 void set_label(CT_NODE_T node, const std::string & new_label) {
257 if(!has_labels()) { label = std::vector<std::string>(get_num_nodes(), ""); } label[node] = new_label;
258 }
259
264 const std::vector<std::string> & get_labels() const { return label; }
265
271 void replace_labels(const std::unordered_map<std::string, std::string> & label_map, bool include_internal = true) {
272 const size_t NUM_NODES = get_num_nodes(); CT_NODE_T curr; std::unordered_map<std::string,std::string>::const_iterator it; std::unordered_map<std::string,std::string>::const_iterator it_end = label_map.cend();
273 for(curr = ROOT_NODE; curr < NUM_NODES; ++curr) {
274 if(is_leaf(curr) || include_internal) {
275 it = label_map.find(label[curr]);
276 if(it != it_end) { label[curr] = it->second; }
277 }
278 }
279 }
280
286 std::tuple<const std::string*, CT_LENGTH_T, CT_NODE_T, const std::vector<CT_NODE_T>*> operator[](CT_NODE_T i) const { return std::make_tuple(&get_label(i), get_edge_length(i), parent[i], &children[i]); }
287
292 class preorder_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
293 private:
294 CT_NODE_T node;
295 public:
296 preorder_iterator(CT_NODE_T x) : node(x) {}
297 preorder_iterator(const preorder_iterator & it) : node(it.node) {}
298 preorder_iterator & operator=(const preorder_iterator & it) { node = it.node; return *this; }
299 preorder_iterator & operator++() { ++node; return *this; }
300 preorder_iterator operator++(int) { preorder_iterator tmp(*this); operator++(); return tmp; }
301 bool operator==(const preorder_iterator & rhs) const { return node == rhs.node; }
302 bool operator!=(const preorder_iterator & rhs) const { return node != rhs.node; }
303 CT_NODE_T operator*() { return node; }
304 };
305
311
317
322 class postorder_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
323 private:
324 CT_NODE_T node;
325 public:
326 postorder_iterator(CT_NODE_T x) : node(x) {}
327 postorder_iterator(const postorder_iterator & it) : node(it.node) {}
328 postorder_iterator & operator=(const postorder_iterator & it) { node = it.node; return *this; }
329 postorder_iterator & operator++() { --node; return *this; }
330 postorder_iterator operator++(int) { postorder_iterator tmp(*this); operator++(); return tmp; }
331 bool operator==(const postorder_iterator & rhs) const { return node == rhs.node; }
332 bool operator!=(const postorder_iterator & rhs) const { return node != rhs.node; }
333 CT_NODE_T operator*() { return node; }
334 };
340
346
352 class levelorder_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
353 private:
354 std::queue<CT_NODE_T> q; compact_tree* tree_ptr;
355 public:
356 levelorder_iterator(compact_tree* const tp) : tree_ptr(tp) {}
357 levelorder_iterator(const CT_NODE_T root, compact_tree* const tp) : tree_ptr(tp) { q.push(root); }
358 levelorder_iterator(std::queue<CT_NODE_T> x, compact_tree* const tp) : q(x), tree_ptr(tp) {}
359 levelorder_iterator(const levelorder_iterator & o) : q(o.q), tree_ptr(o.tree_ptr) {}
360 levelorder_iterator & operator=(const levelorder_iterator & o) { q = o.q; tree_ptr = o.tree_ptr; return *this; }
361 levelorder_iterator & operator++() { CT_NODE_T x = q.front(); std::vector<CT_NODE_T>::iterator it_end = tree_ptr->children[x].end(); for(std::vector<CT_NODE_T>::iterator it = tree_ptr->children[x].begin(); it != it_end; ++it) { q.push(*it); } q.pop(); return *this; }
362 levelorder_iterator operator++(int) { levelorder_iterator tmp(*this); operator++(); return tmp; }
363 bool operator==(const levelorder_iterator & rhs) const { return q == rhs.q; }
364 bool operator!=(const levelorder_iterator & rhs) const { return q != rhs.q; }
365 CT_NODE_T operator*() { return q.front(); }
366 };
367
373
379
384 class leaves_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
385 private:
386 CT_NODE_T node; const compact_tree* tree_ptr;
387 public:
388 leaves_iterator(CT_NODE_T x, const compact_tree* const tp) : node(x), tree_ptr(tp) {}
389 leaves_iterator(const leaves_iterator & it) : node(it.node), tree_ptr(it.tree_ptr) {}
390 leaves_iterator & operator=(const leaves_iterator & it) { node = it.node; tree_ptr = it.tree_ptr; return *this; }
391 leaves_iterator & operator++() { CT_NODE_T num_nodes = tree_ptr->get_num_nodes(); while(((++node) < num_nodes) && !(tree_ptr->is_leaf(node))) {} return *this; }
392 leaves_iterator operator++(int) { leaves_iterator tmp(*this); operator++(); return tmp; }
393 bool operator==(const leaves_iterator & rhs) const { return node == rhs.node; }
394 bool operator!=(const leaves_iterator & rhs) const { return node != rhs.node; }
395 CT_NODE_T operator*() { return node; }
396 };
397
402 leaves_iterator leaves_begin() { CT_NODE_T node = ROOT_NODE; CT_NODE_T num_nodes = get_num_nodes(); while(((++node) < num_nodes) && (!is_leaf(node))) {} return leaves_iterator(node, this); }
403
408 leaves_iterator leaves_end() { return leaves_iterator((CT_NODE_T)get_num_nodes(), this); }
409
414 class children_iterator : public std::iterator<std::input_iterator_tag, CT_NODE_T> {
415 private:
416 std::vector<CT_NODE_T>::iterator it;
417 public:
418 children_iterator(std::vector<CT_NODE_T>::iterator x) : it(x) {}
419 children_iterator(const children_iterator & o) : it(o.it) {}
420 children_iterator & operator=(const children_iterator & o) { it = o.it; return *this; }
421 children_iterator & operator++() { ++it; return *this; }
422 children_iterator operator++(int) { children_iterator tmp(*this); operator++(); return tmp; }
423 bool operator==(const children_iterator & rhs) const { return it == rhs.it; }
424 bool operator!=(const children_iterator & rhs) const { return it != rhs.it; }
425 CT_NODE_T operator*() { return *it; }
426 };
427
433 children_iterator children_begin(CT_NODE_T node) { return children_iterator(children[node].begin()); }
434
440 children_iterator children_end(CT_NODE_T node) { return children_iterator(children[node].end()); }
441
447 CT_NODE_T find_mrca(const std::unordered_set<CT_NODE_T> & nodes) const;
448
454 compact_tree extract_subtree(CT_NODE_T node);
455
462 double calc_total_bl(bool include_internal = true, bool include_leaves = true) const {
463 if(!(include_internal || include_leaves)) { return 0.; }
464 double tot = 0; CT_NODE_T num_nodes = get_num_nodes();
465 for(CT_NODE_T node = 0; node < num_nodes; ++node) {
466 if(is_leaf(node)) { if(include_leaves) { tot += get_edge_length(node); } }
467 else { if(include_internal) { tot += get_edge_length(node); } }
468 }
469 return tot;
470 }
471
478 double calc_avg_bl(bool include_internal = true, bool include_leaves = true) const {
479 if(!(include_internal || include_leaves)) { return 0.; }
480 double tot = 0; CT_NODE_T num_nodes = get_num_nodes(); CT_NODE_T count = 0;
481 for(CT_NODE_T node = 0; node < num_nodes; ++node) {
482 if(is_leaf(node)) { if(include_leaves) { tot += get_edge_length(node); ++count; } }
483 else { if(include_internal) { tot += get_edge_length(node); ++count; } }
484 }
485 return tot / count;
486 }
487
494 double calc_dist(CT_NODE_T u, CT_NODE_T v) const {
495 if(u == v) { return 0.; } if(u == parent[v]) { return get_edge_length(v); } if(v == parent[u]) { return get_edge_length(u); }
496 std::unordered_map<CT_NODE_T, double> u_dists; std::unordered_map<CT_NODE_T, double> v_dists; CT_NODE_T c = u; CT_NODE_T p = parent[c]; std::unordered_map<CT_NODE_T, double>::iterator it;
497 while(p != NULL_NODE) { u_dists[p] = u_dists[c] + get_edge_length(c); c = p; p = parent[p]; } c = v; p = parent[c];
498 while(p != NULL_NODE) { v_dists[p] = v_dists[c] + get_edge_length(c); it = u_dists.find(p); if(it != u_dists.end()) { return it->second + v_dists[p]; } c = p; p = parent[p]; }
499 return -1.; // error, shouldn't reach here
500 }
501
506 std::vector<std::pair<std::pair<CT_NODE_T,CT_NODE_T>, double>> calc_distance_matrix();
507};
508
509// print Newick string (currently recursive)
510void compact_tree::print_newick(std::ostream & out, CT_NODE_T node, bool include_semicolon) {
511 std::vector<CT_NODE_T>::iterator it_begin = children[node].begin(); std::vector<CT_NODE_T>::iterator it_end = children[node].end();
512 for(std::vector<CT_NODE_T>::iterator it = it_begin; it != it_end; ++it) { out << ((it == it_begin) ? '(' : ','); print_newick(out, *it, false); }
513 if(!is_leaf(node)) { out << ')'; }
514 if(label.size() != 0) { out << label[node]; }
515 if(length.size() != 0) { out << ':' << length[node]; }
516 if(include_semicolon) { out << ';'; }
517}
518
519// find the MRCA of nodes
520CT_NODE_T compact_tree::find_mrca(const std::unordered_set<CT_NODE_T> & nodes) const {
521 std::queue<CT_NODE_T, std::deque<CT_NODE_T>> to_visit(std::deque<CT_NODE_T>(nodes.begin(), nodes.end()));
522 std::unordered_map<CT_NODE_T, CT_NODE_T> count; std::unordered_map<CT_NODE_T, CT_NODE_T>::iterator count_it;
523 CT_NODE_T curr_node; CT_NODE_T curr_parent; size_t total = nodes.size();
524 while(!to_visit.empty()) {
525 curr_node = to_visit.front(); to_visit.pop(); count_it = count.find(curr_node); curr_parent = parent[curr_node];
526 if(count_it == count.end()) { // current node hasn't been seen before, initialize its count to 0
527 count_it = count.emplace(curr_node, 0).first;
528 }
529 if((++(count_it->second)) == total) {
530 return curr_node;
531 }
532 if(curr_parent != NULL_NODE) {
533 to_visit.emplace(curr_parent);
534 }
535 }
536 return NULL_NODE; // shouldn't ever reach here
537}
538
539// calculate distance matrix
540std::vector<std::pair<std::pair<CT_NODE_T,CT_NODE_T>, double>> compact_tree::calc_distance_matrix() {
541 // set things up
542 const size_t NUM_NODES = get_num_nodes(); const size_t N = get_num_leaves(); const size_t N_MINUS_1 = N - 1;
543 const size_t N_CHOOSE_2 = ((N%2) == 0) ? ((N/2) * N_MINUS_1) : ((N_MINUS_1/2) * N);
544 std::vector<std::pair<std::pair<CT_NODE_T,CT_NODE_T>, double>> dm; dm.reserve(N_CHOOSE_2);
545 std::unordered_map<CT_NODE_T, std::unordered_map<CT_NODE_T, double>*> leaf_dists;
546 std::vector<CT_NODE_T>::iterator ch_it; std::vector<CT_NODE_T>::iterator ch_it_2; std::vector<CT_NODE_T>::iterator ch_it_end;
547 CT_NODE_T curr; CT_NODE_T child; CT_NODE_T child_2; std::unordered_map<CT_NODE_T, double>* curr_leaf_dists;
548 std::unordered_map<CT_NODE_T, double>* child_leaf_dists; std::unordered_map<CT_NODE_T, double>::iterator dist_it; std::unordered_map<CT_NODE_T, double>::iterator dist_it_end;
549 std::unordered_map<CT_NODE_T, double>* child_leaf_dists_2; std::unordered_map<CT_NODE_T, double>::iterator dist_it_2; std::unordered_map<CT_NODE_T, double>::iterator dist_it_2_end;
550
551 // calculate pairwise distances
552 for(curr = NUM_NODES-1; curr != NULL_NODE; --curr) {
553 curr_leaf_dists = new std::unordered_map<CT_NODE_T, double>();
554 leaf_dists.emplace(curr, curr_leaf_dists);
555
556 // for leaves, they have 0 distance to themselves
557 if(is_leaf(curr)) {
558 curr_leaf_dists->emplace(curr, (double)0.);
559 }
560
561 // for internal nodes:
562 else {
563 // calculate all pairwise distances between leaves below this node
564 for(ch_it = children[curr].begin(), ch_it_end = children[curr].end(); std::next(ch_it) != ch_it_end; ++ch_it) {
565 child = *ch_it; child_leaf_dists = leaf_dists[child];
566 for(ch_it_2 = std::next(ch_it); ch_it_2 != ch_it_end; ++ch_it_2) {
567 child_2 = *ch_it_2; child_leaf_dists_2 = leaf_dists[child_2];
568 for(dist_it = child_leaf_dists->begin(), dist_it_end = child_leaf_dists->end(); dist_it != dist_it_end; ++dist_it) {
569 for(dist_it_2 = child_leaf_dists_2->begin(), dist_it_2_end = child_leaf_dists_2->end(); dist_it_2 != dist_it_2_end; ++dist_it_2) {
570 dm.emplace_back(std::make_pair(std::make_pair(dist_it->first,dist_it_2->first), dist_it->second + dist_it_2->second + get_edge_length(child) + get_edge_length(child_2)));
571 }
572 }
573 }
574 }
575
576 // calculate leaf distances to this node
577 for(ch_it = children[curr].begin(), ch_it_end = children[curr].end(); ch_it != ch_it_end; ++ch_it) {
578 child = *ch_it; child_leaf_dists = leaf_dists[child];
579 for(dist_it = child_leaf_dists->begin(), dist_it_end = child_leaf_dists->end(); dist_it != dist_it_end; ++dist_it) {
580 curr_leaf_dists->emplace(dist_it->first, dist_it->second + length[child]);
581 }
582 delete child_leaf_dists;
583 }
584 }
585 }
586 delete leaf_dists[ROOT_NODE]; return dm;
587}
588
589// helper function to create new node and add as child to parent
590CT_NODE_T compact_tree::create_child(const CT_NODE_T parent_node, bool store_labels, bool store_lengths) {
591 tmp_node = parent.size(); parent.emplace_back(parent_node); children.emplace_back(std::vector<CT_NODE_T>());
592 if(parent_node != NULL_NODE) { children[parent_node].emplace_back(tmp_node); }
593 if(store_lengths) { length.emplace_back((CT_LENGTH_T)0); }
594 if(store_labels) { label.emplace_back(""); }
595 return tmp_node;
596}
597
598// extract a subtree
600 compact_tree new_tree; std::stack<std::pair<CT_NODE_T, CT_NODE_T>> to_copy; to_copy.push(std::make_pair(node,0)); // first = old tree; second = new tree
601 bool has_lengths = (length.size() != 0); bool has_labels = (label.size() != 0); new_tree.create_child(NULL_NODE, has_labels, has_lengths);
602 if(has_lengths) { new_tree.length.emplace_back(ZERO_LENGTH); }
603 if(has_labels) { new_tree.label.emplace_back(EMPTY_STRING); }
604 std::pair<CT_NODE_T, CT_NODE_T> curr;
605 while(!to_copy.empty()) {
606 curr = to_copy.top(); to_copy.pop(); std::vector<CT_NODE_T>::iterator it_end = children[curr.first].end();
607 if(has_lengths) { new_tree.length[curr.second] = length[curr.first]; }
608 if(has_labels) { new_tree.label[curr.second] = label[curr.first]; }
609 for(std::vector<CT_NODE_T>::iterator it = children[curr.first].begin(); it != it_end; ++it) {
610 to_copy.push(std::make_pair(*it, new_tree.create_child(curr.second, has_labels, has_lengths)));
611 }
612 }
613 return new_tree;
614}
615
616// compact_tree constructor (putting it last because it's super long)
617compact_tree::compact_tree(const char* const input, bool is_fn, bool store_labels, bool store_lengths, size_t reserve) {
618 // reserve space up-front (if given `reserve`) to reduce resizing (save time)
619 if(reserve != 0) { parent.reserve(reserve); if(store_lengths) { length.reserve(reserve); } if(store_labels) { label.reserve(reserve); } }
620
621 // set up file input: https://stackoverflow.com/a/17925143/2134991
622 int fd = -1;
623 size_t bytes_read = 0; size_t i; // variables to help with reading
624 const size_t INPUT_LEN = strlen(input); size_t input_i = 0; // variables to help with reading from Newick string specifically
625 char buf[IO_BUFFER_SIZE + 1]; // buffer for reading
626 char str_buf[STR_BUFFER_SIZE] = {}; size_t str_buf_i = 0; // helper string buffer
627 if(is_fn) {
628 fd = open(input, O_RDONLY);
629 if(fd == -1) {
630 throw std::invalid_argument(ERROR_OPENING_FILE + std::string(": ") + std::string(input));
631 }
632 posix_fadvise(fd, 0, 0, 1); // FDADVICE_SEQUENTIAL
633 }
634
635 // set up initial Newick parsing state
636 create_child(NULL_NODE, store_labels, store_lengths); // create empty root node
637 CT_NODE_T curr_node = ROOT_NODE; // start traversal at root
638 bool parse_length = false; // parsing a length right now?
639 bool parse_label = false; // parsing a label right now?
640 bool parse_label_single = false; // parsing a single-quote label right now?
641 bool parse_label_double = false; // parsing a double-quote label right now?
642 bool parse_comment = false; // parsing a comment [...] right now?
643
644 // read Newick tree byte-by-byte
645 while(true) {
646 // either read from file or from Newick C string
647 if(is_fn) {
648 bytes_read = read(fd, buf, IO_BUFFER_SIZE);
649 } else {
650 if(input_i == INPUT_LEN) { // finished parsing string
651 bytes_read = 0;
652 } else {
653 if((input_i + IO_BUFFER_SIZE) <= INPUT_LEN) {
654 bytes_read = IO_BUFFER_SIZE;
655 } else { // fewer than IO_BUFFER_SIZE bytes left in Newick string (input)
656 bytes_read = INPUT_LEN - input_i;
657 }
658 memcpy(buf, input+input_i, bytes_read); input_i += bytes_read;
659 }
660 }
661
662 // handle done reading (!bytes_read) and read failed (bytes_read == -1)
663 if(!bytes_read || bytes_read == (size_t)(-1)) {
664 throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(": ") + std::string(input));
665 }
666
667 // iterate over the characters in the buffer
668 for(i = 0; i < bytes_read; ++i) {
669 // currently parsing a comment (ignore for now)
670 if(parse_comment) {
671 // reached end of comment
672 if(buf[i] == ']') {
673 parse_comment = false;
674 }
675 }
676
677 // currently parsing an edge length
678 else if(parse_length) {
679 switch(buf[i]) {
680 // finished parsing edge length
681 case ',':
682 case ')':
683 case ';':
684 if(store_lengths) { str_buf[str_buf_i] = (char)0; length[curr_node] = PARSE_LENGTH_FROM_C_STR(str_buf); }
685 parse_length = false; --i; break; // need to re-read this character
686
687 // edge comment (ignore for now)
688 case '[':
689 parse_comment = true; break;
690
691 // character within edge length
692 default:
693 if(store_lengths) { str_buf[str_buf_i++] = buf[i]; }
694 break;
695 }
696 }
697
698 // currently parsing a node label
699 else if(parse_label) {
700 // parsing quoted label ('' or ""), so blindly add next char to label
701 if(parse_label_single || parse_label_double) {
702 if(store_labels) { str_buf[str_buf_i++] = buf[i]; }
703 if(buf[i] == '\'' && parse_label_single) {
704 parse_label = false; parse_label_single = false;
705 } else if(buf[i] == '"' && parse_label_double) {
706 parse_label = false; parse_label_double = false;
707 }
708 }
709
710 // parsing non-quoted label, so check next char for validity before adding
711 else {
712 switch(buf[i]) {
713 case ':': case ',': case ')': case ';':
714 parse_label = false; --i; break; // finished label, so need to re-read this character
715 default:
716 if(store_labels) { str_buf[str_buf_i++] = buf[i]; } break;
717 }
718 }
719
720 // if we finished parsing the label, finalize it
721 if(!parse_label) {
722 if(store_labels) { str_buf[str_buf_i] = (char)0; label[curr_node] = str_buf; }
723 parse_label = false;
724 }
725 }
726
727 // all other symbols
728 else {
729 switch(buf[i]) {
730 // ignore spaces outside of labels
731 case ' ':
732 break;
733
734 // end of Newick string
735 case ';':
736 if(curr_node != (CT_NODE_T)0) {
737 throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(": ") + std::string(input));
738 }
739 return;
740
741 // go to new child
742 case '(':
743 if(curr_node == NULL_NODE) {
744 throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(": ") + std::string(input));
745 }
746 curr_node = create_child(curr_node, store_labels, store_lengths); break;
747
748 // go to parent
749 case ')':
750 curr_node = parent[curr_node]; break;
751
752 // go to new sibling
753 case ',':
754 if((curr_node == NULL_NODE) || (parent[curr_node] == NULL_NODE)) {
755 throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(": ") + std::string(input));
756 }
757 curr_node = create_child(parent[curr_node], store_labels, store_lengths); break;
758
759 // node comment (ignore for now)
760 case '[':
761 parse_comment = true; break;
762
763 // edge length
764 case ':':
765 if(store_lengths) { str_buf_i = 0; }
766 parse_length = true; break;
767
768 // about to parse a node label in single quotes ('')
769 case '\'':
770 if(store_labels) { str_buf_i = 0; }
771 parse_label = true; parse_label_single = true; break;
772
773 // about to parse a node label in double quotes ("")
774 case '"':
775 if(store_labels) { str_buf_i = 0; }
776 parse_label = true; parse_label_double = true; break;
777
778 // about to start a node label without quotes
779 default:
780 if(store_labels) { str_buf_i = 0; }
781 parse_label = true; --i; break; // need to re-read this character (it's part of the label)
782 }
783 }
784 }
785 }
786}
787#endif
Definition compact_tree.h:414
Definition compact_tree.h:384
Definition compact_tree.h:352
Definition compact_tree.h:322
Definition compact_tree.h:292
Definition compact_tree.h:61
CT_NODE_T get_root() const
Definition compact_tree.h:169
bool has_edge_lengths() const
Definition compact_tree.h:203
const std::vector< std::string > & get_labels() const
Definition compact_tree.h:264
CT_NODE_T find_mrca(const std::unordered_set< CT_NODE_T > &nodes) const
Definition compact_tree.h:520
double calc_dist(CT_NODE_T u, CT_NODE_T v) const
Definition compact_tree.h:494
postorder_iterator postorder_begin()
Definition compact_tree.h:339
bool is_leaf(CT_NODE_T node) const
Definition compact_tree.h:183
void clear_labels(bool shrink=true)
Definition compact_tree.h:242
leaves_iterator leaves_end()
Definition compact_tree.h:408
void set_edge_length(CT_NODE_T node, CT_LENGTH_T new_length)
Definition compact_tree.h:228
children_iterator children_end(CT_NODE_T node)
Definition compact_tree.h:440
const std::vector< CT_NODE_T > & get_children(CT_NODE_T node) const
Definition compact_tree.h:197
const std::vector< CT_LENGTH_T > & get_edge_lengths() const
Definition compact_tree.h:221
size_t get_num_leaves()
Definition compact_tree.h:157
size_t get_num_nodes() const
Definition compact_tree.h:151
compact_tree(const std::string &input, bool is_fn=true, bool store_labels=true, bool store_lengths=true, size_t reserve=0)
Definition compact_tree.h:115
std::string get_newick(CT_NODE_T node=ROOT_NODE, bool include_semicolon=true)
Definition compact_tree.h:145
CT_NODE_T get_parent(CT_NODE_T node) const
Definition compact_tree.h:190
double calc_total_bl(bool include_internal=true, bool include_leaves=true) const
Definition compact_tree.h:462
levelorder_iterator levelorder_begin()
Definition compact_tree.h:372
preorder_iterator preorder_end()
Definition compact_tree.h:316
bool is_root(CT_NODE_T node) const
Definition compact_tree.h:176
CT_LENGTH_T get_edge_length(CT_NODE_T node) const
Definition compact_tree.h:216
preorder_iterator preorder_begin()
Definition compact_tree.h:310
void replace_labels(const std::unordered_map< std::string, std::string > &label_map, bool include_internal=true)
Definition compact_tree.h:271
const std::string & get_label(CT_NODE_T node) const
Definition compact_tree.h:249
double calc_avg_bl(bool include_internal=true, bool include_leaves=true) const
Definition compact_tree.h:478
levelorder_iterator levelorder_end()
Definition compact_tree.h:378
void set_label(CT_NODE_T node, const std::string &new_label)
Definition compact_tree.h:256
void print_newick(std::ostream &out, CT_NODE_T node=ROOT_NODE, bool include_semicolon=true)
Definition compact_tree.h:510
bool has_labels() const
Definition compact_tree.h:236
postorder_iterator postorder_end()
Definition compact_tree.h:345
compact_tree(const compact_tree &o)
Definition compact_tree.h:121
children_iterator children_begin(CT_NODE_T node)
Definition compact_tree.h:433
size_t get_num_internal()
Definition compact_tree.h:163
std::vector< std::pair< std::pair< CT_NODE_T, CT_NODE_T >, double > > calc_distance_matrix()
Definition compact_tree.h:540
leaves_iterator leaves_begin()
Definition compact_tree.h:402
CT_NODE_T add_child(const CT_NODE_T parent_node, const std::string &new_label=EMPTY_STRING, CT_LENGTH_T new_length=ZERO_LENGTH)
Definition compact_tree.h:130
compact_tree extract_subtree(CT_NODE_T node)
Definition compact_tree.h:599
void clear_edge_lengths(bool shrink=true)
Definition compact_tree.h:209
std::tuple< const std::string *, CT_LENGTH_T, CT_NODE_T, const std::vector< CT_NODE_T > * > operator[](CT_NODE_T i) const
Definition compact_tree.h:286