src/wrappers/PhylogenyWrapper.js

/**
 * PhylogenyWrapper
 */

const {
  has,
  cloneDeep,
} = require('lodash');

/** Used to parse Newick strings. */
const newickJs = require('newick-js');

/** OWL terms to be used here. */
const owlterms = require('../utils/owlterms');

const { TaxonomicUnitWrapper } = require('./TaxonomicUnitWrapper');
const { TaxonomicUnitMatcher } = require('../matchers/TaxonomicUnitMatcher');
const { CitationWrapper } = require('./CitationWrapper');

class PhylogenyWrapper {
  // Wraps a Phylogeny in a PHYX file and provides access to nodes, node labels
  // and other information. Remember that a Phylogeny also has the
  // additionalNodeProperties object, which provides additional properties for
  // nodes, keyed by node label.

  /**
   * Construct a wrapper around a Phylogeny object from a PHYX file.
   *
   * Note that this version ONLY uses the `newick` property to determine the
   * phylogeny: if other representations are included (such as a node-based
   * format, as used in JSON-LD), they will be ignored and possibly overwritten
   * during export. So, to update the phylogeny, please only update the newick
   * string! This ensures that we don't need to reconcile between different
   * possible representations of a phylogeny.
   *
   * @param {Object} phylogeny - The Phylogeny object to wrap. It is stored by
   *   reference, not copied.
   * @param {*} defaultNomenCode - The nomenclatural code passed to
   *   TaxonomicUnitWrapper.fromLabel() when a taxonomic unit has to be built
   *   from a node label. Defaults to owlterms.UNKNOWN_CODE.
   */
  constructor(phylogeny, defaultNomenCode = owlterms.UNKNOWN_CODE) {
    this.phylogeny = phylogeny;
    this.defaultNomenCode = defaultNomenCode;
  }

  /**
   * Return a normalized form of the phylogeny.
   *
   * The input is deep-cloned, so the caller's object is never modified. The
   * Newick string is deliberately left as-is; only the `source` citation (if
   * the property is present) is normalized using CitationWrapper.normalize().
   * A falsy `source` (e.g. `null`) is normalized as if it were `{}`.
   *
   * @param {Object} phylogeny - The phylogeny to normalize.
   * @returns {Object} A normalized deep copy of `phylogeny`.
   */
  static normalize(phylogeny) {
    const normalizedPhylogeny = cloneDeep(phylogeny);

    // We could normalize the Newick string, but that doesn't seem very nice.

    // Normalize the source if there is one.
    if ('source' in phylogeny) {
      normalizedPhylogeny.source = CitationWrapper.normalize(phylogeny.source || {});
    }

    return normalizedPhylogeny;
  }

  /**
   * Given a Newick string, return a list of errors found in parsing this
   * string. Each error is an object with two properties:
   *  - title: A short title of the error, distinct for each type of error.
   *  - message: A longer description of the error, which might include
   *    information specific to a particular error.
   *
   * We try to order errors from most helpful ('Unbalanced parentheses in
   * Newick string') to least helpful ('Error parsing phylogeny').
   *
   * @param {string} newick - The Newick string to check. A missing string
   *   (`undefined` or `null`) is treated like an empty string.
   * @returns {Object[]} The errors found; empty if there were none.
   */
  static getErrorsInNewickString(newick) {
    // A missing Newick string is reported in the same way as an empty one,
    // rather than throwing a TypeError from `.trim()`.
    const newickTrimmed = (newick === undefined || newick === null)
      ? ''
      : String(newick).trim();
    const errors = [];

    // Look for an empty Newick string.
    if (newickTrimmed === '' || newickTrimmed === '()' || newickTrimmed === '();') {
      // None of the later errors are relevant here, so bail out now.
      return [{
        title: 'No phylogeny entered',
        message: 'Click on "Edit as Newick" to enter a phylogeny below.',
      }];
    }

    // Look for an unbalanced Newick string. Only the overall balance is
    // checked here; misordered parentheses are left for the parser to report.
    let parenLevels = 0;
    for (let x = 0; x < newickTrimmed.length; x += 1) {
      if (newickTrimmed[x] === '(') parenLevels += 1;
      if (newickTrimmed[x] === ')') parenLevels -= 1;
    }

    if (parenLevels !== 0) {
      errors.push({
        title: 'Unbalanced parentheses in Newick string',
        message: (parenLevels > 0
          ? `You have ${parenLevels} too many open parentheses`
          : `You have ${-parenLevels} too few open parentheses`
        ),
      });
    }

    // Finally, try parsing it with newickJs.parse() and see if we get an error.
    try {
      newickJs.parse(newickTrimmed);
    } catch (ex) {
      errors.push({
        title: 'Error parsing phylogeny',
        message: `An error occured while parsing this phylogeny: ${ex.message}`,
      });
    }

    return errors;
  }

  /**
   * Recurse through PhyloTree-style nodes (objects with an optional
   * `children` array), executing a function on each node in pre-order.
   *
   * @param {Object} node - The node to recurse from. `func` is called on
   *   `node` *before* being called on its children.
   * @param {Function} func - Called as `func(node, nodeCount, parentCount)`
   *   for `node` and every one of its descendants.
   * @param {number} nodeCount - The count passed for `node`. Descendants are
   *   given consecutively increasing counts.
   * @param {number|undefined} parentCount - The count associated with the
   *   parent of `node` within this run. Immediate children of the starting
   *   node get the starting node's count (0 by default); the starting node
   *   itself gets `undefined` by default.
   * @returns {number} The next unused count, i.e. one more than the largest
   *   count handed to `func` in this subtree.
   */
  static recurseNodes(node, func, nodeCount = 0, parentCount = undefined) {
    func(node, nodeCount, parentCount);

    let nextID = nodeCount + 1;

    // Recurse through all children of this node, threading the running
    // counter through so that every node gets a unique count.
    if (has(node, 'children')) {
      node.children.forEach((child) => {
        nextID = PhylogenyWrapper.recurseNodes(
          child,
          func,
          nextID,
          nodeCount
        );
      });
    }

    return nextID;
  }

  /**
   * Return all taxonomic units in this phylogeny.
   *
   * Node labels will be extracted from:
   *  - internal nodes only (if nodeType == 'internal')
   *  - terminal nodes only (if nodeType == 'terminal')
   *  - both internal and terminal nodes (if nodeType == 'both')
   *
   * See `getTaxonomicUnitsForNodeLabel` to see how node labels are converted
   * into taxonomic units, but in brief:
   *  1. We look for taxonomic units in the additionalNodeProperties.
   *  2. If none are found, we attempt to build one from the node label.
   *
   * @param {string} nodeType - One of 'internal', 'terminal' or 'both'.
   * @returns {Set<Object>} A Set (not an array) of taxonomic unit objects.
   *   Since these are objects, the Set only removes entries that are the very
   *   same object, not entries that merely have equal content.
   * @throws {Error} If the Newick string cannot be parsed or `nodeType` is
   *   not recognized (see `getNodeLabels`).
   */
  getTaxonomicUnits(nodeType = 'both') {
    const tunits = new Set();

    this.getNodeLabels(nodeType).forEach(
      nodeLabel => this.getTaxonomicUnitsForNodeLabel(nodeLabel)
        .forEach(tunit => tunits.add(tunit))
    );

    return tunits;
  }

  /**
   * Return a list of the distinct node labels in this phylogeny.
   *
   * The Newick string is parsed once per call; a missing or empty `newick`
   * property is treated as '()'.
   *
   * @param {string} nodeType - One of:
   *   - 'internal': labels on internal nodes (those with at least one child).
   *   - 'terminal': labels on terminal nodes (those without any children).
   *   - 'both': labels on both internal and terminal nodes.
   * @returns {string[]} Distinct labels in the order they were first seen.
   *   Unlabeled nodes are skipped.
   * @throws {Error} If newickJs.parse() fails, or if `nodeType` is not one of
   *   the values above.
   */
  getNodeLabels(nodeType = 'both') {
    // Parse the phylogeny (will throw an exception if parsing failed).
    const { graph } = newickJs.parse(this.phylogeny.newick || '()');
    const [vertices, arcs] = graph;

    // Collect the distinct, defined labels of a list of vertices.
    const distinctLabels = vertexList => Array.from(new Set(
      vertexList
        .map(vertex => vertex.label)
        .filter(label => label !== undefined)
    ));

    // Every arc is [parent, child, ...], so the parent of any arc is an
    // internal node.
    const parentVertices = Array.from(arcs).map(arc => arc[0]);

    if (nodeType === 'both') {
      return distinctLabels(Array.from(vertices));
    }

    if (nodeType === 'internal') {
      return distinctLabels(parentVertices);
    }

    if (nodeType === 'terminal') {
      // Terminal nodes are vertices that never appear as the parent of an
      // arc. Comparing vertices (rather than labels) gives the right answer
      // even when an internal node has the same label as a terminal node.
      const internalVertices = new Set(parentVertices);
      return distinctLabels(
        Array.from(vertices).filter(vertex => !internalVertices.has(vertex))
      );
    }

    throw new Error(`Unknown nodeType: '${nodeType}'`);
  }

  /**
   * Return a list of taxonomic units for a node label.
   *
   * If the additionalNodeProperties for this node label includes taxonomic units
   * (using `representsTaxonomicUnits` = obo:CDAO_0000187), then those taxonomic
   * units are used. Otherwise, one will be constructed using the default
   * nomenclatural code set up when this PhylogenyWrapper was set up.
   *
   * @param {string} nodeLabel - The node label to look up.
   * @returns {Object[]} The taxonomic units for this label. When they come
   *   from additionalNodeProperties, the stored value is returned as-is (not
   *   copied). An empty list is returned if no unit could be built from the
   *   label.
   */
  getTaxonomicUnitsForNodeLabel(nodeLabel) {
    // Look up additional node properties.
    let additionalNodeProperties = {};
    if (
      has(this.phylogeny, 'additionalNodeProperties')
      && has(this.phylogeny.additionalNodeProperties, nodeLabel)
    ) {
      additionalNodeProperties = this.phylogeny.additionalNodeProperties[nodeLabel];
    }

    // If there are explicit taxonomic units in the
    // representsTaxonomicUnits property, we need to use those.
    if (has(additionalNodeProperties, 'representsTaxonomicUnits')) {
      return additionalNodeProperties.representsTaxonomicUnits;
    }

    // If that doesn't work, we can try to build a taxonomic unit from
    // the node label. Note that taxonomic units will NOT be extracted from
    // the label if explicit taxonomic units are present!
    //
    // Note that old-style taxonomic units were lists while new-style taxonomic
    // units are single objects. So we turn it into a single entry list here.
    const tunit = TaxonomicUnitWrapper.fromLabel(nodeLabel.trim(), this.defaultNomenCode);
    if (tunit) return [tunit];
    return []; // No TUnit? Return the empty list.
  }

  /**
   * Return a list of node labels in this phylogeny matched by a specifier.
   *
   * A node label matches if at least one of its taxonomic units (see
   * `getTaxonomicUnitsForNodeLabel`) is reported as matched by a
   * TaxonomicUnitMatcher constructed with the specifier and that unit.
   *
   * @param {Object} specifier - The specifier to match against.
   * @returns {string[]} The matching node labels.
   */
  getNodeLabelsMatchedBySpecifier(specifier) {
    return this.getNodeLabels().filter((nodeLabel) => {
      // Find all the taxonomic units associated with the node.
      const nodeTUnits = this.getTaxonomicUnitsForNodeLabel(nodeLabel);

      // Attempt pairwise matches between the specifier and each taxonomic
      // unit associated with the node.
      return nodeTUnits.some(
        tunit => new TaxonomicUnitMatcher(specifier, tunit).matched
      );
    });
  }

  /**
   * Parse a Newick string into a phylotree.js-like tree structure.
   *
   * We previously used phylotree.js's Newick parser to parse Newick into a
   * tree-like structure. However, this is difficult to integrate using NPM.
   * This method provides a similar facility using the newick-js library: the
   * vertex objects returned by newickJs.parse() are decorated in place with
   * `children`, `name` and `attribute` properties.
   *
   * @param {string} newick - The Newick string to parse.
   * @returns {{json: Object}} An object whose `json` property is the root node.
   * @throws {Error} If the Newick string could not be parsed.
   */
  static getParsedNewick(newick) {
    const { graph, root, rootWeight } = newickJs.parse(newick);
    const [, arcs] = graph;

    // Go through the arcs, assigning 'children' to the appropriate parent node.
    arcs.forEach((arc) => {
      const [parent, child, weight] = arc;

      // Add child to parent.children.
      if (!has(parent, 'children')) parent.children = [];
      parent.children.push(child);

      // Phylotree.js uses 'name' instead of 'label'.
      if (has(parent, 'label')) { parent.name = parent.label; }
      if (has(child, 'label')) { child.name = child.label; }

      // Phylotree.js uses 'attribute' to store weights, so we'll store it there as well.
      if (!has(child, 'attribute') && !Number.isNaN(weight)) child.attribute = weight;
    });

    // The root is only named by the loop above if it is the parent of some
    // arc, so name it explicitly: otherwise a single-node tree (such as
    // 'A;') would lose its label.
    if (has(root, 'label')) { root.name = root.label; }

    // Set root 'attribute' to root weight.
    if (!has(root, 'attribute') && !Number.isNaN(rootWeight)) root.attribute = rootWeight;

    return { json: root };
  }

  /**
   * Return the parsed Newick string, but with EVERY node given an IRI.
   *
   * @param {string} baseIRI - The base IRI to use for node elements (e.g.
   *   ':phylogeny1'). Node IDs are generated by concatenating
   *   `_node${number}` to the end of the baseIRI, where the numbers are the
   *   counts produced by `recurseNodes`.
   * @param {Function} newickParser - A function for converting a Newick
   *   string to an object-based representation. The static method
   *   PhylogenyWrapper.getParsedNewick() is used if none is provided.
   * @returns {Object} Whatever `newickParser` returned. If it has a `json`
   *   property, every node reachable from it has had its '@id' set in place.
   */
  getParsedNewickWithIRIs(baseIRI, newickParser = PhylogenyWrapper.getParsedNewick) {
    const parsed = newickParser(this.phylogeny.newick || '()');

    if (has(parsed, 'json')) {
      PhylogenyWrapper.recurseNodes(parsed.json, (node, nodeCount) => {
        // We intentionally modify the parsed node in place here.
        const nodeToModify = node;
        nodeToModify['@id'] = `${baseIRI}_node${nodeCount}`;
      });
    }

    return parsed;
  }

  /**
   * Return a list of all nodes in this phylogeny as JSON-LD objects.
   *
   * Each returned node has an '@id', an rdf:type list, and (where they apply)
   * `labels`, any additional node properties recorded for its label,
   * `representsTaxonomicUnits`, `parent`, `children` and `siblings`. The
   * latter three refer to other nodes by IRI.
   *
   * @param {string} baseIRI - The base IRI to use for node elements (e.g.
   *   ':phylogeny1'). Node IDs are generated by concatenating
   *   `_node${number}` to the end of the baseIRI.
   * @param {Function} newickParser - A function for converting a Newick
   *   string to an object-based representation. See
   *   PhylogenyWrapper.getParsedNewick() for an example implementation, which
   *   is also what is used if this is left undefined.
   * @returns {Object[]} The nodes, in the pre-order in which `recurseNodes`
   *   visits them (so the root node comes first).
   * @throws {Error} If two nodes end up with the same IRI.
   */
  getNodesAsJSONLD(baseIRI, newickParser) {
    // List of nodes we have identified.
    const nodes = [];

    // We need to track the identifiers we give each node as we go.
    const nodesById = {};
    const nodeIdsByParentId = {};

    // Extract the additional node properties, which are keyed by node label.
    const { additionalNodeProperties } = this.phylogeny;

    // Parse the Newick string; if parseable, recurse through the nodes,
    // adding them to the list of JSON-LD nodes as we go.
    const parsed = this.getParsedNewickWithIRIs(baseIRI, newickParser);
    if (has(parsed, 'json')) {
      PhylogenyWrapper.recurseNodes(parsed.json, (node, nodeCount, parentCount) => {
        const nodeAsJSONLD = {};

        // Set @id. It should already have been set by getParsedNewickWithIRIs()!
        const nodeIRI = node['@id'];
        nodeAsJSONLD['@id'] = nodeIRI;

        // Since we may need to add multiple classes into the rdf:type, we need
        // to make @type an array. However, the JSON-LD library we use in JPhyloRef
        // can't support @type being an array (despite that being in the standard,
        // see https://w3c.github.io/json-ld-syntax/#example-14-specifying-multiple-types-for-a-node),
        // so we fall back to using rdf:type instead.
        nodeAsJSONLD[owlterms.RDF_TYPE] = [{ '@id': owlterms.CDAO_NODE }];

        // Add labels, additional node properties and taxonomic units.
        if (has(node, 'name') && node.name !== '') {
          // Add node label.
          nodeAsJSONLD.labels = [node.name];

          // Add additional node properties, if any.
          if (additionalNodeProperties && has(additionalNodeProperties, node.name)) {
            const propertiesForNode = additionalNodeProperties[node.name];
            Object.keys(propertiesForNode).forEach((key) => {
              nodeAsJSONLD[key] = propertiesForNode[key];
            });
          }

          // Add taxonomic units into the metadata.
          nodeAsJSONLD.representsTaxonomicUnits = this.getTaxonomicUnitsForNodeLabel(node.name);

          // Add each taxonomic unit into the rdf:type (as an OWL restriction)
          // so we can reason over it. Units without an equivalent class
          // expression are skipped.
          nodeAsJSONLD.representsTaxonomicUnits.forEach((tu) => {
            const equivClass = new TaxonomicUnitWrapper(tu).asOWLEquivClass;
            if (equivClass) {
              nodeAsJSONLD[owlterms.RDF_TYPE].push(
                {
                  '@type': 'owl:Restriction',
                  onProperty: owlterms.CDAO_REPRESENTS_TU,
                  someValuesFrom: equivClass,
                }
              );
            }
          });
        }

        // Add a reference to the parent, and remember this node as one of
        // the parent's children so that we can fill in children and siblings
        // once every node has been visited.
        if (parentCount !== undefined) {
          const parentIRI = `${baseIRI}_node${parentCount}`;
          nodeAsJSONLD.parent = parentIRI;

          // Update list of nodes by parent IDs.
          if (!has(nodeIdsByParentId, parentIRI)) {
            nodeIdsByParentId[parentIRI] = new Set();
          }
          nodeIdsByParentId[parentIRI].add(nodeIRI);
        }

        // Add nodeAsJSONLD to list
        if (has(nodesById, nodeIRI)) {
          throw new Error(`Error in programming: duplicate node IRI generated (${nodeIRI})`);
        }
        nodesById[nodeIRI] = nodeAsJSONLD;
        nodes.push(nodeAsJSONLD);
      });
    }

    // Go through nodes again and set children and sibling relationships.
    Object.keys(nodeIdsByParentId).forEach((parentId) => {
      // What are the children of this parentId?
      const childrenIDs = Array.from(nodeIdsByParentId[parentId]);

      // Record the child IRIs on the parent node, if we emitted one.
      if (has(nodesById, parentId)) {
        nodesById[parentId].children = childrenIDs;
      }

      childrenIDs.forEach((childId) => {
        // Add all other siblings to node.siblings, but don't add this node itself!
        nodesById[childId].siblings = childrenIDs.filter(otherId => otherId !== childId);
      });
    });

    return nodes;
  }

  /**
   * Export this phylogeny as JSON-LD.
   *
   * The wrapped phylogeny is copied (via a JSON round trip) before being
   * modified, so the wrapped object itself is left untouched.
   *
   * @param {string} fallbackIRI - The fallback IRI to use for this phylogeny
   *   if it does not already have an '@id' set.
   * @param {Function} newickParser - A function that parses a Newick string
   *   and returns an object-based representation of this phylogeny. If not
   *   set, the static method PhylogenyWrapper.getParsedNewick will be used
   *   instead.
   * @returns {Object} The phylogeny as a JSON-LD object, with `nodes` and
   *   (when there is at least one node) `hasRootNode` filled in.
   */
  asJSONLD(fallbackIRI, newickParser) {
    // Create a copy to export.
    const phylogenyAsJSONLD = JSON.parse(JSON.stringify(this.phylogeny));

    // Set name and class for phylogeny. If no '@id' is set, use the provided
    // fallbackIRI.
    if (!has(phylogenyAsJSONLD, '@id')) phylogenyAsJSONLD['@id'] = fallbackIRI;
    phylogenyAsJSONLD['@type'] = 'phyloref:ReferencePhylogenyEvidence';

    // Translate nodes into JSON-LD objects.
    phylogenyAsJSONLD.nodes = this.getNodesAsJSONLD(phylogenyAsJSONLD['@id'], newickParser);
    if (phylogenyAsJSONLD.nodes.length > 0) {
      // We don't have a better way to identify the root node, so we just
      // default to the first one.
      phylogenyAsJSONLD.hasRootNode = {
        '@id': phylogenyAsJSONLD.nodes[0]['@id'],
      };
    }

    // Add a bibliographicCitation to the source if it is a Citation. A source
    // that is present but not an object (for example `null`) cannot carry a
    // citation, so it is left untouched rather than causing a TypeError.
    const { source } = phylogenyAsJSONLD;
    if (
      source !== null
      && typeof source === 'object'
      && !has(source, 'bibliographicCitation')
    ) {
      source.bibliographicCitation = new CitationWrapper(source).toString();
    }

    return phylogenyAsJSONLD;
  }
}

module.exports = {
  PhylogenyWrapper,
};