Source code for deepr.vocab.base

"""Utilities for vocabularies."""

from typing import Iterable, List

from deepr.io.path import Path


def write(path: str, vocab: Iterable[str]):
    """Write vocabulary to file.

    Items are validated and collected in a single pass before the file is
    opened, so one-shot iterables (e.g. generators) are written correctly
    instead of being exhausted by the validation step.

    Items are joined with a newline character; no trailing newline is
    written after the last item.

    Parameters
    ----------
    path : str
        Path to .txt file with one item per line
    vocab : Iterable[str]
        Iterable of lexemes (strings)
        Lexemes should not have newline characters.

    Raises
    ------
    TypeError
        If ``vocab`` is itself a string, or if one of its items is not a
        string.
    ValueError
        If an item contains a newline character.
    """
    # A bare string is an iterable of characters, which is almost certainly
    # a caller mistake: reject it explicitly.
    if isinstance(vocab, str):
        msg = f"Expected iterable of strings, but got a string ({vocab})"
        raise TypeError(msg)

    # Validate and materialize in the same pass: iterating `vocab` twice
    # would silently produce an empty file for generators / iterators.
    items = []
    for item in vocab:
        if not isinstance(item, str):
            msg = f"Expected item of type str, but got {type(item)} for item {item}"
            raise TypeError(msg)
        if "\n" in item:
            msg = f"Found newline character in item {item} (forbidden)."
            raise ValueError(msg)
        items.append(item)

    # Write each item on a new line (file is only opened once validation passed)
    with Path(path).open("w") as file:
        file.write("\n".join(items))
def read(path: str) -> List[str]:
    """Read vocabulary from file.

    Every line is stripped of surrounding whitespace; lines that are empty
    once stripped are skipped.

    Parameters
    ----------
    path : str
        Path to .txt file with one item per line

    Returns
    -------
    List[str]
        Stripped, non-empty lines in file order.
    """
    lexemes = []
    with Path(path).open() as handle:
        for raw_line in handle:
            lexeme = raw_line.strip()
            if lexeme:
                lexemes.append(lexeme)
    return lexemes
def size(path: str) -> int:
    """Return the number of vocabulary items stored in a file.

    The count is the length of the list returned by ``read`` for the
    same path.

    Parameters
    ----------
    path : str
        Path to .txt file with one item per line

    Returns
    -------
    int
        Number of items returned by ``read(path)``.
    """
    lexemes = read(path)
    return len(lexemes)