Skip to content

API reference

ioc_typing.IOCClassifier

Classifier for identifying various types of IOCs (Indicators of Compromise).

This class provides functionality to classify different types of indicators commonly found in threat intelligence and security analysis, including:

  • IP addresses (IPv4 and IPv6)
  • Domain names
  • URLs (including various protocols and complex structures)
  • Cryptographic hashes (MD5, SHA1, SHA256)

The classifier uses regular expressions to identify and validate each type of indicator. For each input, it returns a dictionary containing: - determined: Boolean indicating if the input was successfully classified - type_pri: Primary type (e.g., "ip", "domain", "url", "hash") - type_sec: Secondary type where applicable (e.g. "v4"/"v6" for IPs, hash types)

The class is designed to be strict in its classifications to minimize false positives, while still being flexible enough to handle common variations in format.

Source code in src/ioc_typing/ioc_classifier.py
class IOCClassifier:
    """
    Classifier for identifying various types of IOCs (Indicators of Compromise).

    This class provides functionality to classify different types of indicators commonly
    found in threat intelligence and security analysis, including:

    - IP addresses (IPv4 and IPv6)
    - Domain names
    - URLs (including various protocols and complex structures)
    - Cryptographic hashes (MD5, SHA1, SHA256)

    The classifier uses regular expressions to identify and validate each type of
    indicator.
    For each input, it returns a dictionary containing:
        - determined: Boolean indicating if the input was successfully classified
        - type_pri: Primary type (e.g., "ip", "domain", "url", "hash")
        - type_sec: Secondary type where applicable (e.g. "v4"/"v6" for IPs, hash types)

    The class is designed to be strict in its classifications to minimize false
    positives, while still being flexible enough to handle common variations in format.
    """

    def __init__(self) -> None:
        self.patterns = self._compile_patterns()

    def _compile_patterns(self) -> dict[str, re.Pattern[str]]:
        """
        Compile all regex patterns used for classification.
        Returns a dictionary of compiled patterns for better performance and
        organization.
        """
        # IP patterns
        ipv4_pattern = (
            r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
            r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$"
        )

        ipv6_pattern = (
            r"^(?:"
            r"(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}|"
            r"(?:[0-9a-fA-F]{1,4}:){1,7}:|"
            r"(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|"
            r"(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|"
            r"(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|"
            r"(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|"
            r"(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|"
            r"[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|"
            r":(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|"
            r"fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|"
            r"::(?:ffff(?::0{1,4}){0,1}:){0,1}"
            r"(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}"
            r"(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|"
            r"(?:[0-9a-fA-F]{1,4}:){1,4}:"
            r"(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}"
            r"(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$"
        )

        # Hash patterns
        hash_patterns = {
            "md5": r"^[a-fA-F0-9]{32}$",
            "sha1": r"^[a-fA-F0-9]{40}$",
            "sha256": r"^[a-fA-F0-9]{64}$",
        }

        # Domain pattern (trailing dot is allowed — RFC 1034 FQDN form)
        domain_pattern = (
            r"^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)" r"+[a-zA-Z]{2,}\.?$"
        )

        # URL components
        url_components = {
            "protocol": r"(?:(?:https?|ftp|sftp|ftps|ssh|git|file)://)",
            "auth": r"(?:(?:[A-Za-z0-9\-._~!$&\'()*+,;=:]|%[0-9A-Fa-f]{2})*@)",
            "host": (
                r"(?:(?:[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?\.)"
                r"+[A-Za-z]{2,}\.?|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
            ),
            "port": r"(?::\d+)",
            "path": r"(?:/[^\s?#]*)",
            "query": r"(?:\?[A-Za-z0-9\-._~!$&\'()*+,;=:@%/\[\]]*)",
            "fragment": r"(?:#[^\s]*)",
        }

        # Construct URL pattern
        url_pattern = (
            r"^(?:"
            + url_components["protocol"]
            + url_components["auth"]
            + r"?"
            + url_components["host"]
            + url_components["port"]
            + r"?"
            + url_components["path"]
            + r"?"
            + url_components["query"]
            + r"?"
            + url_components["fragment"]
            + r"?|"
            + r"(?!"
            + url_components["protocol"]
            + r")"
            + url_components["host"]
            + r"(?="
            + url_components["port"]
            + r"|"
            + url_components["path"]
            + r"|"
            + url_components["query"]
            + r"|"
            + url_components["fragment"]
            + r")"
            + url_components["port"]
            + r"?"
            + url_components["path"]
            + r"?"
            + url_components["query"]
            + r"?"
            + url_components["fragment"]
            + r"?"
            + r")$"
        )

        # Compile all patterns
        return {
            "ipv4": re.compile(ipv4_pattern),
            "ipv6": re.compile(ipv6_pattern),
            "md5": re.compile(hash_patterns["md5"]),
            "sha1": re.compile(hash_patterns["sha1"]),
            "sha256": re.compile(hash_patterns["sha256"]),
            "domain": re.compile(domain_pattern),
            "url": re.compile(url_pattern, re.IGNORECASE),
        }

    def classify(self, query: str) -> ClassificationResult:
        """
        Classify a given string into various cybersecurity-related types.

        Args:
            query (str): The string to classify

        Returns:
            dict: Classification result with query, determined status and type info
        """

        # Check IP addresses first (most specific)
        if self.patterns["ipv4"].fullmatch(query):
            return self._create_result(query, "ip", "v4")
        if self.patterns["ipv6"].fullmatch(query):
            return self._create_result(query, "ip", "v6")

        # Check hashes (specific patterns)
        for hash_type in ["md5", "sha1", "sha256"]:
            if self.patterns[hash_type].fullmatch(query):
                return self._create_result(query, "hash", hash_type)

        # Check URL before domain (URLs are more specific)
        if self.patterns["url"].fullmatch(query):
            return self._create_result(query, "url", None)

        # Check domain last (most general)
        if self.patterns["domain"].fullmatch(query):
            return self._create_result(query, "domain", None)

        return {"query": query, "determined": False, "type_pri": None, "type_sec": None}

    def _create_result(
        self, query: str, type_pri: str, type_sec: str | None
    ) -> ClassificationResult:
        """Helper method to create a result dictionary."""
        return {
            "query": query,
            "determined": True,
            "type_pri": type_pri,
            "type_sec": type_sec,
        }

classify

classify(query: str) -> ClassificationResult

Classify a given string into various cybersecurity-related types.

Parameters:

Name Type Description Default
query str

The string to classify

required

Returns:

Name Type Description
dict ClassificationResult

Classification result with query, determined status and type info

Source code in src/ioc_typing/ioc_classifier.py
def classify(self, query: str) -> ClassificationResult:
    """
    Classify a given string into various cybersecurity-related types.

    Args:
        query (str): The string to classify

    Returns:
        dict: Classification result with query, determined status and type info
    """

    # Check IP addresses first (most specific)
    if self.patterns["ipv4"].fullmatch(query):
        return self._create_result(query, "ip", "v4")
    if self.patterns["ipv6"].fullmatch(query):
        return self._create_result(query, "ip", "v6")

    # Check hashes (specific patterns)
    for hash_type in ["md5", "sha1", "sha256"]:
        if self.patterns[hash_type].fullmatch(query):
            return self._create_result(query, "hash", hash_type)

    # Check URL before domain (URLs are more specific)
    if self.patterns["url"].fullmatch(query):
        return self._create_result(query, "url", None)

    # Check domain last (most general)
    if self.patterns["domain"].fullmatch(query):
        return self._create_result(query, "domain", None)

    return {"query": query, "determined": False, "type_pri": None, "type_sec": None}

ioc_typing.ClassificationResult

Bases: TypedDict

Shape of every value returned by :meth:IOCClassifier.classify.

Source code in src/ioc_typing/ioc_classifier.py
class ClassificationResult(TypedDict):
    """Shape of every value returned by :meth:`IOCClassifier.classify`."""

    query: str
    determined: bool
    type_pri: str | None
    type_sec: str | None