22 #include <unordered_map>
23 #include <unordered_set>
28 #if defined CT_NODE_64
29 #define CT_NODE_T std::uint64_t
31 #define CT_NODE_T std::uint32_t
35 #if defined CT_LENGTH_DOUBLE
36 #define CT_LENGTH_T double
37 #define PARSE_LENGTH_FROM_C_STR std::atof
38 #define PARSE_LENGTH_FROM_STRING std::stod
40 #define CT_LENGTH_T float
41 #define PARSE_LENGTH_FROM_C_STR (float)std::atof
42 #define PARSE_LENGTH_FROM_STRING std::stof
46 #define COMPACTTREE_VERSION "0.0.7"
47 #define IO_BUFFER_SIZE 16384
48 #define STR_BUFFER_SIZE 16384
49 const std::string EMPTY_STRING =
"";
50 const CT_NODE_T ROOT_NODE = (CT_NODE_T)0;
51 const CT_NODE_T NULL_NODE = (CT_NODE_T)(-1);
52 const CT_LENGTH_T ZERO_LENGTH = (CT_LENGTH_T)0.;
53 const CT_LENGTH_T ZERO_THRESH = (CT_LENGTH_T)0.000001;
56 const std::string ERROR_OPENING_FILE =
"Error opening file";
57 const std::string ERROR_INVALID_NEWICK_FILE =
"Invalid Newick file";
58 const std::string ERROR_INVALID_NEWICK_STRING =
"Invalid Newick string";
66 std::vector<CT_NODE_T> parent;
67 std::vector<std::vector<CT_NODE_T>> children;
68 std::vector<std::string> label;
69 std::vector<CT_LENGTH_T> length;
74 size_t num_leaves = 0;
89 CT_NODE_T create_child(
const CT_NODE_T parent_node,
bool store_labels,
bool store_lengths);
94 void calc_num_leaves() { CT_NODE_T num_nodes = parent.size();
for(CT_NODE_T curr = 0; curr < num_nodes; ++curr) {
if(
is_leaf(curr)) { num_leaves += 1; } } }
105 compact_tree(
const char*
const input,
bool is_fn =
true,
bool store_labels =
true,
bool store_lengths =
true,
size_t reserve = 0);
115 compact_tree(
const std::string & input,
bool is_fn =
true,
bool store_labels =
true,
bool store_lengths =
true,
size_t reserve = 0) :
compact_tree(&input[0], is_fn, store_labels, store_lengths, reserve) {}
121 compact_tree(
const compact_tree & o) : parent(o.parent), children(o.children), label(o.label), length(o.length), num_leaves(o.num_leaves) {}
130 CT_NODE_T
add_child(
const CT_NODE_T parent_node,
const std::string & new_label = EMPTY_STRING, CT_LENGTH_T new_length = ZERO_LENGTH) {
if((num_leaves != 0) && !
is_leaf(parent_node)) { ++num_leaves; } CT_NODE_T x = create_child(parent_node,
has_labels(),
has_edge_lengths());
if(!(new_label.empty())) {
set_label(x,new_label); }
if(std::abs(new_length) > ZERO_THRESH) {
set_edge_length(x,new_length); }
return x; }
138 void print_newick(std::ostream & out, CT_NODE_T node = ROOT_NODE,
bool include_semicolon =
true);
145 std::string
get_newick(CT_NODE_T node = ROOT_NODE,
bool include_semicolon =
true) { std::ostringstream oss;
print_newick(oss,node,include_semicolon);
return oss.str(); }
157 size_t get_num_leaves() {
if(num_leaves == 0) { calc_num_leaves(); }
return num_leaves; }
176 bool is_root(CT_NODE_T node)
const {
return node == ROOT_NODE; }
183 bool is_leaf(CT_NODE_T node)
const {
return children[node].size() == 0; }
190 CT_NODE_T
get_parent(CT_NODE_T node)
const {
return parent[node]; }
197 const std::vector<CT_NODE_T> &
get_children(CT_NODE_T node)
const {
return children[node]; }
209 void clear_edge_lengths(
bool shrink =
true) {
if(!length.empty()) { length.clear();
if(shrink) { length.shrink_to_fit(); } } }
242 void clear_labels(
bool shrink =
true) {
if(!label.empty()) { label.clear();
if(shrink) { label.shrink_to_fit(); } } }
256 void set_label(CT_NODE_T node,
const std::string & new_label) {
264 const std::vector<std::string> &
get_labels()
const {
return label; }
271 void replace_labels(
const std::unordered_map<std::string, std::string> & label_map,
bool include_internal =
true) {
272 const size_t NUM_NODES =
get_num_nodes(); CT_NODE_T curr; std::unordered_map<std::string,std::string>::const_iterator it; std::unordered_map<std::string,std::string>::const_iterator it_end = label_map.cend();
273 for(curr = ROOT_NODE; curr < NUM_NODES; ++curr) {
274 if(
is_leaf(curr) || include_internal) {
275 it = label_map.find(label[curr]);
276 if(it != it_end) { label[curr] = it->second; }
286 std::tuple<const std::string*, CT_LENGTH_T, CT_NODE_T, const std::vector<CT_NODE_T>*>
operator[](CT_NODE_T i)
const {
return std::make_tuple(&
get_label(i),
get_edge_length(i), parent[i], &children[i]); }
301 bool operator==(
const preorder_iterator & rhs)
const {
return node == rhs.node; }
302 bool operator!=(
const preorder_iterator & rhs)
const {
return node != rhs.node; }
303 CT_NODE_T operator*() {
return node; }
333 CT_NODE_T operator*() {
return node; }
361 levelorder_iterator & operator++() { CT_NODE_T x = q.front(); std::vector<CT_NODE_T>::iterator it_end = tree_ptr->children[x].end();
for(std::vector<CT_NODE_T>::iterator it = tree_ptr->children[x].begin(); it != it_end; ++it) { q.push(*it); } q.pop();
return *
this; }
365 CT_NODE_T operator*() {
return q.front(); }
393 bool operator==(
const leaves_iterator & rhs)
const {
return node == rhs.node; }
394 bool operator!=(
const leaves_iterator & rhs)
const {
return node != rhs.node; }
395 CT_NODE_T operator*() {
return node; }
416 std::vector<CT_NODE_T>::iterator it;
425 CT_NODE_T operator*() {
return *it; }
447 CT_NODE_T
find_mrca(
const std::unordered_set<CT_NODE_T> & nodes)
const;
462 double calc_total_bl(
bool include_internal =
true,
bool include_leaves =
true)
const {
463 if(!(include_internal || include_leaves)) {
return 0.; }
465 for(CT_NODE_T node = 0; node < num_nodes; ++node) {
478 double calc_avg_bl(
bool include_internal =
true,
bool include_leaves =
true)
const {
479 if(!(include_internal || include_leaves)) {
return 0.; }
480 double tot = 0; CT_NODE_T num_nodes =
get_num_nodes(); CT_NODE_T count = 0;
481 for(CT_NODE_T node = 0; node < num_nodes; ++node) {
483 else {
if(include_internal) { tot +=
get_edge_length(node); ++count; } }
496 std::unordered_map<CT_NODE_T, double> u_dists; std::unordered_map<CT_NODE_T, double> v_dists; CT_NODE_T c = u; CT_NODE_T p = parent[c]; std::unordered_map<CT_NODE_T, double>::iterator it;
497 while(p != NULL_NODE) { u_dists[p] = u_dists[c] +
get_edge_length(c); c = p; p = parent[p]; } c = v; p = parent[c];
498 while(p != NULL_NODE) { v_dists[p] = v_dists[c] +
get_edge_length(c); it = u_dists.find(p);
if(it != u_dists.end()) {
return it->second + v_dists[p]; } c = p; p = parent[p]; }
511 std::vector<CT_NODE_T>::iterator it_begin = children[node].begin(); std::vector<CT_NODE_T>::iterator it_end = children[node].end();
512 for(std::vector<CT_NODE_T>::iterator it = it_begin; it != it_end; ++it) { out << ((it == it_begin) ?
'(' :
',');
print_newick(out, *it,
false); }
513 if(!
is_leaf(node)) { out <<
')'; }
514 if(label.size() != 0) { out << label[node]; }
515 if(length.size() != 0) { out <<
':' << length[node]; }
516 if(include_semicolon) { out <<
';'; }
521 std::queue<CT_NODE_T, std::deque<CT_NODE_T>> to_visit(std::deque<CT_NODE_T>(nodes.begin(), nodes.end()));
522 std::unordered_map<CT_NODE_T, CT_NODE_T> count; std::unordered_map<CT_NODE_T, CT_NODE_T>::iterator count_it;
523 CT_NODE_T curr_node; CT_NODE_T curr_parent;
size_t total = nodes.size();
524 while(!to_visit.empty()) {
525 curr_node = to_visit.front(); to_visit.pop(); count_it = count.find(curr_node); curr_parent = parent[curr_node];
526 if(count_it == count.end()) {
527 count_it = count.emplace(curr_node, 0).first;
529 if((++(count_it->second)) == total) {
532 if(curr_parent != NULL_NODE) {
533 to_visit.emplace(curr_parent);
543 const size_t N_CHOOSE_2 = ((N%2) == 0) ? ((N/2) * N_MINUS_1) : ((N_MINUS_1/2) * N);
544 std::vector<std::pair<std::pair<CT_NODE_T,CT_NODE_T>,
double>> dm; dm.reserve(N_CHOOSE_2);
545 std::unordered_map<CT_NODE_T, std::unordered_map<CT_NODE_T, double>*> leaf_dists;
546 std::vector<CT_NODE_T>::iterator ch_it; std::vector<CT_NODE_T>::iterator ch_it_2; std::vector<CT_NODE_T>::iterator ch_it_end;
547 CT_NODE_T curr; CT_NODE_T child; CT_NODE_T child_2; std::unordered_map<CT_NODE_T, double>* curr_leaf_dists;
548 std::unordered_map<CT_NODE_T, double>* child_leaf_dists; std::unordered_map<CT_NODE_T, double>::iterator dist_it; std::unordered_map<CT_NODE_T, double>::iterator dist_it_end;
549 std::unordered_map<CT_NODE_T, double>* child_leaf_dists_2; std::unordered_map<CT_NODE_T, double>::iterator dist_it_2; std::unordered_map<CT_NODE_T, double>::iterator dist_it_2_end;
552 for(curr = NUM_NODES-1; curr != NULL_NODE; --curr) {
553 curr_leaf_dists =
new std::unordered_map<CT_NODE_T, double>();
554 leaf_dists.emplace(curr, curr_leaf_dists);
558 curr_leaf_dists->emplace(curr, (
double)0.);
564 for(ch_it = children[curr].begin(), ch_it_end = children[curr].end(); std::next(ch_it) != ch_it_end; ++ch_it) {
565 child = *ch_it; child_leaf_dists = leaf_dists[child];
566 for(ch_it_2 = std::next(ch_it); ch_it_2 != ch_it_end; ++ch_it_2) {
567 child_2 = *ch_it_2; child_leaf_dists_2 = leaf_dists[child_2];
568 for(dist_it = child_leaf_dists->begin(), dist_it_end = child_leaf_dists->end(); dist_it != dist_it_end; ++dist_it) {
569 for(dist_it_2 = child_leaf_dists_2->begin(), dist_it_2_end = child_leaf_dists_2->end(); dist_it_2 != dist_it_2_end; ++dist_it_2) {
570 dm.emplace_back(std::make_pair(std::make_pair(dist_it->first,dist_it_2->first), dist_it->second + dist_it_2->second +
get_edge_length(child) +
get_edge_length(child_2)));
577 for(ch_it = children[curr].begin(), ch_it_end = children[curr].end(); ch_it != ch_it_end; ++ch_it) {
578 child = *ch_it; child_leaf_dists = leaf_dists[child];
579 for(dist_it = child_leaf_dists->begin(), dist_it_end = child_leaf_dists->end(); dist_it != dist_it_end; ++dist_it) {
580 curr_leaf_dists->emplace(dist_it->first, dist_it->second + length[child]);
582 delete child_leaf_dists;
586 delete leaf_dists[ROOT_NODE];
return dm;
590 CT_NODE_T compact_tree::create_child(
const CT_NODE_T parent_node,
bool store_labels,
bool store_lengths) {
591 tmp_node = parent.size(); parent.emplace_back(parent_node); children.emplace_back(std::vector<CT_NODE_T>());
592 if(parent_node != NULL_NODE) { children[parent_node].emplace_back(tmp_node); }
593 if(store_lengths) { length.emplace_back((CT_LENGTH_T)0); }
594 if(store_labels) { label.emplace_back(
""); }
600 compact_tree new_tree; std::stack<std::pair<CT_NODE_T, CT_NODE_T>> to_copy; to_copy.push(std::make_pair(node,0));
601 bool has_lengths = (length.size() != 0);
bool has_labels = (label.size() != 0); new_tree.create_child(NULL_NODE,
has_labels, has_lengths);
602 if(has_lengths) { new_tree.length.emplace_back(ZERO_LENGTH); }
603 if(
has_labels) { new_tree.label.emplace_back(EMPTY_STRING); }
604 std::pair<CT_NODE_T, CT_NODE_T> curr;
605 while(!to_copy.empty()) {
606 curr = to_copy.top(); to_copy.pop(); std::vector<CT_NODE_T>::iterator it_end = children[curr.first].end();
607 if(has_lengths) { new_tree.length[curr.second] = length[curr.first]; }
608 if(
has_labels) { new_tree.label[curr.second] = label[curr.first]; }
609 for(std::vector<CT_NODE_T>::iterator it = children[curr.first].begin(); it != it_end; ++it) {
610 to_copy.push(std::make_pair(*it, new_tree.create_child(curr.second,
has_labels, has_lengths)));
617 compact_tree::compact_tree(
const char*
const input,
bool is_fn,
bool store_labels,
bool store_lengths,
size_t reserve) {
619 if(reserve != 0) { parent.reserve(reserve);
if(store_lengths) { length.reserve(reserve); }
if(store_labels) { label.reserve(reserve); } }
623 size_t bytes_read = 0;
size_t i;
624 const size_t INPUT_LEN = strlen(input);
size_t input_i = 0;
625 char buf[IO_BUFFER_SIZE + 1];
626 char str_buf[STR_BUFFER_SIZE] = {};
size_t str_buf_i = 0;
628 fd = open(input, O_RDONLY);
630 throw std::invalid_argument(ERROR_OPENING_FILE + std::string(
": ") + std::string(input));
632 posix_fadvise(fd, 0, 0, 1);
636 create_child(NULL_NODE, store_labels, store_lengths);
637 CT_NODE_T curr_node = ROOT_NODE;
638 bool parse_length =
false;
639 bool parse_label =
false;
640 bool parse_label_single =
false;
641 bool parse_label_double =
false;
642 bool parse_comment =
false;
648 bytes_read = read(fd, buf, IO_BUFFER_SIZE);
650 if(input_i == INPUT_LEN) {
653 if((input_i + IO_BUFFER_SIZE) <= INPUT_LEN) {
654 bytes_read = IO_BUFFER_SIZE;
656 bytes_read = INPUT_LEN - input_i;
658 memcpy(buf, input+input_i, bytes_read); input_i += bytes_read;
663 if(!bytes_read || bytes_read == (
size_t)(-1)) {
664 throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(
": ") + std::string(input));
668 for(i = 0; i < bytes_read; ++i) {
673 parse_comment =
false;
678 else if(parse_length) {
684 if(store_lengths) { str_buf[str_buf_i] = (char)0; length[curr_node] = PARSE_LENGTH_FROM_C_STR(str_buf); }
685 parse_length =
false; --i;
break;
689 parse_comment =
true;
break;
693 if(store_lengths) { str_buf[str_buf_i++] = buf[i]; }
699 else if(parse_label) {
701 if(parse_label_single || parse_label_double) {
702 if(store_labels) { str_buf[str_buf_i++] = buf[i]; }
703 if(buf[i] ==
'\'' && parse_label_single) {
704 parse_label =
false; parse_label_single =
false;
705 }
else if(buf[i] ==
'"' && parse_label_double) {
706 parse_label =
false; parse_label_double =
false;
713 case ':':
case ',':
case ')':
case ';':
714 parse_label =
false; --i;
break;
716 if(store_labels) { str_buf[str_buf_i++] = buf[i]; }
break;
722 if(store_labels) { str_buf[str_buf_i] = (char)0; label[curr_node] = str_buf; }
736 if(curr_node != (CT_NODE_T)0) {
737 throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(
": ") + std::string(input));
743 if(curr_node == NULL_NODE) {
744 throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(
": ") + std::string(input));
746 curr_node = create_child(curr_node, store_labels, store_lengths);
break;
750 curr_node = parent[curr_node];
break;
754 if((curr_node == NULL_NODE) || (parent[curr_node] == NULL_NODE)) {
755 throw std::invalid_argument((is_fn ? ERROR_INVALID_NEWICK_FILE : ERROR_INVALID_NEWICK_STRING) + std::string(
": ") + std::string(input));
757 curr_node = create_child(parent[curr_node], store_labels, store_lengths);
break;
761 parse_comment =
true;
break;
765 if(store_lengths) { str_buf_i = 0; }
766 parse_length =
true;
break;
770 if(store_labels) { str_buf_i = 0; }
771 parse_label =
true; parse_label_single =
true;
break;
775 if(store_labels) { str_buf_i = 0; }
776 parse_label =
true; parse_label_double =
true;
break;
780 if(store_labels) { str_buf_i = 0; }
781 parse_label =
true; --i;
break;
Definition: compact_tree.h:414
Definition: compact_tree.h:384
Definition: compact_tree.h:352
Definition: compact_tree.h:322
Definition: compact_tree.h:292
Definition: compact_tree.h:61
const std::vector< std::string > & get_labels() const
Definition: compact_tree.h:264
CT_NODE_T get_root() const
Definition: compact_tree.h:169
bool has_edge_lengths() const
Definition: compact_tree.h:203
const std::string & get_label(CT_NODE_T node) const
Definition: compact_tree.h:249
CT_NODE_T find_mrca(const std::unordered_set< CT_NODE_T > &nodes) const
Definition: compact_tree.h:520
double calc_dist(CT_NODE_T u, CT_NODE_T v) const
Definition: compact_tree.h:494
postorder_iterator postorder_begin()
Definition: compact_tree.h:339
bool is_leaf(CT_NODE_T node) const
Definition: compact_tree.h:183
void clear_labels(bool shrink=true)
Definition: compact_tree.h:242
const std::vector< CT_LENGTH_T > & get_edge_lengths() const
Definition: compact_tree.h:221
leaves_iterator leaves_end()
Definition: compact_tree.h:408
void set_edge_length(CT_NODE_T node, CT_LENGTH_T new_length)
Definition: compact_tree.h:228
children_iterator children_end(CT_NODE_T node)
Definition: compact_tree.h:440
size_t get_num_leaves()
Definition: compact_tree.h:157
size_t get_num_nodes() const
Definition: compact_tree.h:151
compact_tree(const std::string &input, bool is_fn=true, bool store_labels=true, bool store_lengths=true, size_t reserve=0)
Definition: compact_tree.h:115
std::string get_newick(CT_NODE_T node=ROOT_NODE, bool include_semicolon=true)
Definition: compact_tree.h:145
CT_NODE_T get_parent(CT_NODE_T node) const
Definition: compact_tree.h:190
double calc_total_bl(bool include_internal=true, bool include_leaves=true) const
Definition: compact_tree.h:462
levelorder_iterator levelorder_begin()
Definition: compact_tree.h:372
preorder_iterator preorder_end()
Definition: compact_tree.h:316
bool is_root(CT_NODE_T node) const
Definition: compact_tree.h:176
CT_LENGTH_T get_edge_length(CT_NODE_T node) const
Definition: compact_tree.h:216
preorder_iterator preorder_begin()
Definition: compact_tree.h:310
void replace_labels(const std::unordered_map< std::string, std::string > &label_map, bool include_internal=true)
Definition: compact_tree.h:271
double calc_avg_bl(bool include_internal=true, bool include_leaves=true) const
Definition: compact_tree.h:478
levelorder_iterator levelorder_end()
Definition: compact_tree.h:378
void set_label(CT_NODE_T node, const std::string &new_label)
Definition: compact_tree.h:256
void print_newick(std::ostream &out, CT_NODE_T node=ROOT_NODE, bool include_semicolon=true)
Definition: compact_tree.h:510
bool has_labels() const
Definition: compact_tree.h:236
postorder_iterator postorder_end()
Definition: compact_tree.h:345
std::tuple< const std::string *, CT_LENGTH_T, CT_NODE_T, const std::vector< CT_NODE_T > * > operator[](CT_NODE_T i) const
Definition: compact_tree.h:286
compact_tree(const compact_tree &o)
Definition: compact_tree.h:121
children_iterator children_begin(CT_NODE_T node)
Definition: compact_tree.h:433
size_t get_num_internal()
Definition: compact_tree.h:163
const std::vector< CT_NODE_T > & get_children(CT_NODE_T node) const
Definition: compact_tree.h:197
std::vector< std::pair< std::pair< CT_NODE_T, CT_NODE_T >, double > > calc_distance_matrix()
Definition: compact_tree.h:540
leaves_iterator leaves_begin()
Definition: compact_tree.h:402
CT_NODE_T add_child(const CT_NODE_T parent_node, const std::string &new_label=EMPTY_STRING, CT_LENGTH_T new_length=ZERO_LENGTH)
Definition: compact_tree.h:130
compact_tree extract_subtree(CT_NODE_T node)
Definition: compact_tree.h:599
void clear_edge_lengths(bool shrink=true)
Definition: compact_tree.h:209