# Source code for hed.schema.schema_validation_util

"""Utilities used in HED validation/loading using a HED schema."""

from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import SchemaWarnings
from hed.schema import hed_schema_constants as constants
from hed.schema.hed_schema_constants import character_types
from hed.schema.hed_schema import HedSchema


def validate_schema_tag_new(hed_entry):
    """ Check a tag entry for capitalization and illegal characters.

    The short tag name must start with a digit or an uppercase character; every
    character is then checked by validate_schema_term_new.

    Parameters:
        hed_entry (HedTagEntry): A single tag entry.

    Returns:
        list: A list of all formatting issues found in the term. Each issue is a dictionary.
    """
    hed_term = hed_entry.short_tag_name
    # Any # terms will have already been validated as the previous entry.
    if hed_term == "#":
        return []

    issues_list = []
    # Only inspect the first character when there is one; an empty/None term
    # falls through to the term check, which substitutes hed_entry.name.
    if hed_term:
        first_char = hed_term[0]
        if not (first_char.isdigit() or first_char.isupper()):
            issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION,
                                                     hed_term, char_index=0, problem_char=first_char)
    issues_list += validate_schema_term_new(hed_entry, hed_term)
    return issues_list


def validate_schema_term_new(hed_entry, hed_term=None):
    """ Check the term for invalid character issues.

    The allowed characters are the "name" character set plus whatever the entry
    lists in its comma-separated "allowedCharacter" attribute.

    Parameters:
        hed_entry (HedSchemaEntry): A single schema entry.
        hed_term (str or None): Use instead of hed_entry.name if present (non-empty).

    Returns:
        list: A list of all formatting issues found in the term. Each issue is a dictionary.
    """
    if not hed_term:
        hed_term = hed_entry.name

    # todo: potentially optimize this someday, as most values are the same
    extra_set_names = hed_entry.attributes.get("allowedCharacter", "").split(",")
    character_set = get_allowed_characters_by_name(["name"] + extra_set_names)

    issues_list = []
    for char, index in get_problem_indexes(hed_term, character_set):
        issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG,
                                                 hed_term, char_index=index, problem_char=char)
    return issues_list


def validate_schema_description_new(hed_entry):
    """ Check the description of the entry for invalid character issues.

    Characters are validated against the "text" and "comma" character sets.
    Entries with no description (None or empty) produce no issues.

    Parameters:
        hed_entry (HedSchemaEntry): A single schema entry.

    Returns:
        list: A list of all invalid characters found in description. Each issue is a dictionary.
    """
    description = hed_entry.description
    if not description:
        return []

    character_set = get_allowed_characters_by_name(["text", "comma"])
    # Kludge, just get short name here if we have it for error reporting
    name = getattr(hed_entry, "short_tag_name", hed_entry.name)

    issues_list = []
    for char, index in get_problem_indexes(description, character_set):
        issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC,
                                                 description, name, problem_char=char, char_index=index)
    return issues_list


def schema_version_for_library(hed_schema, library_name):
    """ Given the library name and hed schema object, return the version.

    hed_schema.library and hed_schema.version_number are treated as parallel
    comma-separated lists; the version paired with the first matching library
    name is returned.

    Parameters:
        hed_schema (HedSchema): The schema object.
        library_name (str or None): The library name you're interested in.  "" (or None) for the standard schema.

    Returns:
        version_number (str): The version number of the given library name.  Returns None if unknown library_name.
            When the standard schema is requested but not listed, the partnered
            (with_standard) version is returned if one is set.
    """
    wanted = "" if library_name is None else library_name

    library_names = hed_schema.library.split(",")
    version_numbers = hed_schema.version_number.split(",")
    for candidate, version in zip(library_names, version_numbers):
        if candidate == wanted:
            return version

    # Return the partnered schema version
    if wanted == "" and hed_schema.with_standard:
        return hed_schema.with_standard
    return None


def get_allowed_characters(value_classes):
    """Returns the allowed characters in a given container of value classes.

    Each value class contributes the comma-separated names found in its
    allowedCharacter attribute; the names are then expanded by
    get_allowed_characters_by_name.

    Parameters:
        value_classes(list of HedSchemaEntry): A list of schema entries that should have the allowedCharacter attribute

    Returns:
        character_set(set): The set of all characters from the given classes, plus "#" and "/".
    """
    # This could be pre-computed
    character_set_names = []
    for value_class in value_classes:
        allowed_attribute = value_class.attributes.get(constants.HedKey.AllowedCharacter, "")
        character_set_names.extend(allowed_attribute.split(","))

    character_set = get_allowed_characters_by_name(character_set_names)
    # for now, just always allow these special cases(it's validated extensively elsewhere)
    character_set.update("#/")
    return character_set


def get_allowed_characters_by_name(character_set_names):
    """Returns the allowed characters from a list of character set names.

    Note: "nonascii" is a special case "character" that can be included as well.
          It is stored in the result as the literal string "nonascii" rather than
          being expanded.  Any name not found in character_types is likewise
          added to the result verbatim.

    Parameters:
        character_set_names(list of str): A list of character sets to allow.  See hed_schema_constants.character_types

    Returns:
        character_set(set): The set of all characters from the names
    """
    character_set = set()
    for name in character_set_names:
        if name == "nonascii" or name not in character_types:
            # Marker or unrecognised name: keep the entry itself.
            character_set.add(name)
        else:
            character_set.update(character_types[name])
    return character_set


def get_problem_indexes(validation_string, character_set, index_adj=0):
    """Finds indexes with values not in character set.

    Parameters:
        validation_string(str): The string to check characters in
        character_set(set): The set of valid characters (may also contain the entry "nonascii",
                            in which case any character with code point above 127 is accepted)
        index_adj(int): the value to adjust the reported indices by, if this isn't the start of a string.

    Returns:
        index_list(list of (str, int)): The problematic characters and their (adjusted) indices.
                                        Empty if character_set is empty.
    """
    # An empty character set means there is nothing to validate against.
    if not character_set:
        return []

    allow_nonascii = "nonascii" in character_set
    return [(char, index + index_adj)
            for index, char in enumerate(validation_string)
            if char not in character_set and not (allow_nonascii and ord(char) > 127)]