"""Algorithms for learning CCG entries from exmaples."""

import itertools
import collections
from typing import Any, Optional, Union, Iterable, Tuple, List, Callable
from dataclasses import dataclass
from jacinle.utils.printing import indent_text

from concepts.dsl.dsl_types import ConstantType
from concepts.dsl.dsl_functions import Function
from concepts.dsl.value import Value
from concepts.dsl.expression import ConstantExpression, FunctionApplicationExpression
from concepts.dsl.executors.executor_base import DSLExecutionError
from concepts.language.ccg.composition import CCGCompositionContext, CCGCompositionError
from concepts.language.ccg.semantics import CCGSemantics
from concepts.language.ccg.grammar import CCG, Lexicon, CCGNode
from import CCGSyntaxSearcherBase, CCGSyntaxEnumerativeSearcher, CCGSemanticsSearcherBase, CCGSemanticsEnumerativeSearcher
from import CCGSyntaxSearchResult, CCGSemanticsSearchResult

# Public API of this module; every documented top-level learning entry point is exported.
__all__ = [
    'CCGLearningResult',
    'by_parsing', 'by_parsing_with_lexicon_gen', 'by_grounding',
    'auto_research_novel_words'
]

@dataclass
class CCGLearningResult(object):
    """One outcome of CCG learning: the learned words, their lexicon entries, and the resulting parses."""

    words: Tuple[str, ...]
    """The words whose lexicon entries were learned."""

    lexicons: Tuple[Lexicon, ...]
    """The learned lexicon entries, aligned index-by-index with `words` (same length)."""

    parsing_results: Tuple[CCGNode, ...]
    """The parsing results of the entire sentence obtained with the learned lexicons."""

    def format_summary(self):
        """Build a multi-line, human-readable summary of the learned lexicons and the parsing results.

        Returns:
            The summary as a single string (terminated by a newline).
        """
        pieces = ['Learned lexicons:\n']
        for word, lexicon in zip(self.words, self.lexicons):
            pieces.append(indent_text(word + ': ' + str(lexicon)) + '\n')

        pieces.append('Parsing result:\n')
        for index, node in enumerate(self.parsing_results):
            # Strip the leading indent so the first line sits right after the "<index>: " prefix.
            rendered = indent_text(str(node), indent_format=' ').lstrip()
            pieces.append(f'{index}: ' + rendered + '\n')
        return ''.join(pieces)

    def print_summary(self):
        """Print the text produced by :meth:`format_summary` to standard output."""
        print(self.format_summary())
def by_parsing(
    ccg: CCG, sentence: Union[str, Iterable[str]], *,
    novel_words: Optional[tuple] = None,
    candidate_syntax_types: Optional[List[CCGSyntaxSearchResult]] = None,
    syntax_searcher: Optional[CCGSyntaxSearcherBase] = None,
    syntax_searcher_kwargs: Optional[dict] = None,
    candidate_semantics: Optional[List[CCGSemanticsSearchResult]] = None,
    semantics_searcher: Optional[CCGSemanticsSearcherBase] = None,
    semantics_searcher_kwargs: Optional[dict] = None,
    bind_concepts: bool = True
) -> List[CCGLearningResult]:
    """Learn CCG lexicon entries from a sentence by trying to parse the sentence.

    The search runs in two stages. First, every assignment of candidate syntax types to the novel words is
    tried with a syntax-only parse, and an assignment is kept when at least one parse has a type name listed
    in the syntax searcher's starting symbols. Second, for every kept assignment, all combinations of
    candidate semantics whose number of variable arguments equals the arity of the assigned syntax type are
    tried with a full parse.

    Args:
        ccg: the CCG grammar.
        sentence: the sentence to be parsed. A string is split on whitespace.
        novel_words: the list of novel words to be learned. If not specified, the algorithm will detect all
            novel words in the sentence (words with no lexicon entry in ``ccg``), in order of first appearance.
        candidate_syntax_types: the list of candidate syntax types to be used for parsing. If not specified,
            the algorithm will use the syntax searcher to generate the candidate syntax types.
        syntax_searcher: the syntax searcher to be used for generating candidate syntax types. If not
            specified, the algorithm will use the enumerative searcher. Its starting symbols are also used to
            filter the syntax-only parses, even when ``candidate_syntax_types`` is given explicitly.
        syntax_searcher_kwargs: the keyword arguments for the syntax searcher.
        candidate_semantics: the list of candidate semantics to be used for parsing. If not specified, the
            algorithm will use the semantics searcher to generate the candidate semantics.
        semantics_searcher: the semantics searcher to be used for generating candidate semantics. If not
            specified, the algorithm will use the enumerative searcher.
        semantics_searcher_kwargs: the keyword arguments for the semantics searcher.
        bind_concepts: whether to bind concepts in the semantics. This will allow algorithm to invent novel
            concepts while learning.

    Returns:
        The result of the learning process, as a list of :class:`CCGLearningResult`.
    """
    if isinstance(sentence, str):
        sentence = sentence.split()
    elif not isinstance(sentence, (list, tuple)):
        # The sentence is traversed several times below; materialize one-shot iterables once.
        sentence = list(sentence)

    # The searcher is needed even when candidate syntax types are supplied, because the first stage
    # consults its starting symbols. Always fall back to the default searcher.
    if syntax_searcher is None:
        syntax_searcher = CCGSyntaxEnumerativeSearcher(ccg.syntax_system)
    if candidate_syntax_types is None:
        if syntax_searcher_kwargs is None:
            syntax_searcher_kwargs = dict()
        candidate_syntax_types = syntax_searcher.gen(**syntax_searcher_kwargs)

    if candidate_semantics is None:
        if semantics_searcher is None:
            semantics_searcher = CCGSemanticsEnumerativeSearcher(ccg.function_domain)
        if semantics_searcher_kwargs is None:
            semantics_searcher_kwargs = dict()
        candidate_semantics = semantics_searcher.gen(**semantics_searcher_kwargs)

    # Group candidate semantics by their number of variable arguments, to be matched against syntax arity.
    semantics_search_results_arity = collections.defaultdict(list)
    for r in candidate_semantics:
        if not bind_concepts and r.nr_constant_arguments > 0:
            continue  # if we do not allow binding new concepts, we skip those semantics with constant arguments.
        semantics_search_results_arity[r.nr_variable_arguments].append(r.semantics)

    if novel_words is None:
        # dict.fromkeys de-duplicates while keeping first-appearance order, so the word order (and thus the
        # order of the results) does not depend on string hash randomization.
        novel_words = tuple(dict.fromkeys(
            word for word in sentence
            if word not in ccg.lexicon_entries or len(ccg.lexicon_entries[word]) == 0
        ))
    else:
        novel_words = tuple(novel_words)

    # NOTE(review): entries are only cleared *after* each attempt. If `novel_words` names words that already
    # have entries in `ccg`, the clone presumably still carries them during the very first attempt -- confirm
    # whether they should be cleared up-front.
    new_ccg = ccg.clone()

    # Stage 1: syntax-only filtering of type assignments.
    success_syntax_types = list()
    for syntax_types in itertools.product(candidate_syntax_types, repeat=len(novel_words)):
        for word, syntax in zip(novel_words, syntax_types):
            new_ccg.add_entry(word, Lexicon(syntax.syntax, None))
        try:
            with CCGCompositionContext(semantics=False, exc_verbose=False).as_default():
                parsing_results = new_ccg.parse(sentence)
            # TODO: maybe cache the results.
            # TODO: add a flag to control this check.
            if any(r.syntax.typename in syntax_searcher._starting_symbols for r in parsing_results):
                success_syntax_types.append(syntax_types)
        except CCGCompositionError:
            pass
        for word in novel_words:
            new_ccg.clear_entries(word)

    # Stage 2: full parsing with every arity-compatible combination of semantics.
    success_results = list()
    for syntax_types in success_syntax_types:
        semantics_choices = [semantics_search_results_arity[syntax.syntax.arity] for syntax in syntax_types]
        for semantics_list in itertools.product(*semantics_choices):
            this_lexicons = list()
            for word, syntax, semantics in zip(novel_words, syntax_types, semantics_list):
                if bind_concepts:
                    semantics = _bind_concepts(semantics, word)
                lexicon = Lexicon(syntax.syntax, semantics)
                new_ccg.add_entry(word, lexicon)
                this_lexicons.append(lexicon)
            try:
                with CCGCompositionContext(exc_verbose=True).as_default():
                    parsing_results = new_ccg.parse(sentence)
                if len(parsing_results) > 0:
                    success_results.append(CCGLearningResult(novel_words, tuple(this_lexicons), parsing_results))
            except CCGCompositionError:
                pass
            for word in novel_words:
                new_ccg.clear_entries(word)
    return success_results
def _bind_concepts(semantics: CCGSemantics, word: str):
    """Bind every constant-typed argument of a function semantics to a value built from ``word``.

    Args:
        semantics: the candidate semantics.
        word: the word being learned; used as the payload of each bound constant value.

    Returns:
        ``semantics`` itself when it is not a function or has no argument whose dtype is a
        :class:`ConstantType`; otherwise a new :class:`CCGSemantics` wrapping the partially applied value,
        with the same ``is_conj`` flag.
    """
    if not semantics.is_function:
        return semantics

    # Arguments are addressed positionally through '#<index>' keys.
    bindings = {
        f'#{index}': Value(arg.dtype, word)
        for index, arg in enumerate(semantics.value.arguments)
        if isinstance(arg.dtype, ConstantType)
    }
    if not bindings:
        return semantics

    bound_value = semantics.value.partial(**bindings, execute_fully_bound_functions=True)
    return CCGSemantics(bound_value, is_conj=semantics.is_conj)
def by_parsing_with_lexicon_gen(
    ccg: CCG, sentence: Union[str, list, tuple],
    lexicon_generator: Callable[[str], Iterable[Lexicon]],
    novel_words: Optional[tuple] = None
) -> List[CCGLearningResult]:
    """Learn CCG lexicon entries from a sentence by trying to parse the sentence.

    Unlike :func:`by_parsing`, this function takes a lexicon generator instead of a list of candidate syntax
    types and semantics. Every combination of one candidate lexicon per novel word is tried, and a combination
    is kept when the sentence has at least one parse.

    Args:
        ccg: the CCG grammar.
        sentence: the sentence to be parsed. A string is split on whitespace.
        lexicon_generator: the lexicon generator to be used for generating candidate lexicons. It takes a word
            as input and returns a list of candidate lexicons.
        novel_words: the list of novel words to be learned. If not specified, the algorithm will detect all
            novel words in the sentence (words with no lexicon entry in ``ccg``), in order of first appearance.

    Returns:
        The result of the learning process, as a list of :class:`CCGLearningResult`.
    """
    if isinstance(sentence, str):
        sentence = sentence.split()

    if novel_words is None:
        # dict.fromkeys de-duplicates while keeping first-appearance order, so the word order (and thus the
        # order of the results) does not depend on string hash randomization.
        novel_words = tuple(dict.fromkeys(
            word for word in sentence
            if word not in ccg.lexicon_entries or len(ccg.lexicon_entries[word]) == 0
        ))
    else:
        novel_words = tuple(novel_words)

    new_ccg = ccg.clone()
    candidate_lexicons = [lexicon_generator(word) for word in novel_words]

    success_results = list()
    # TODO(Jiayuan Mao @ 07/11): optimize by first filtering syntax types.
    for new_lexicon in itertools.product(*candidate_lexicons):
        for word, entry in zip(novel_words, new_lexicon):
            new_ccg.add_entry(word, entry)
        try:
            with CCGCompositionContext(exc_verbose=False).as_default():
                parsing_results = new_ccg.parse(sentence)
            if len(parsing_results) > 0:
                success_results.append(CCGLearningResult(novel_words, tuple(new_lexicon), parsing_results))
        except CCGCompositionError:
            pass
        # Reset the novel words before trying the next combination.
        for word in novel_words:
            new_ccg.clear_entries(word)
    return success_results
def by_grounding(
    by_parsing_learning_func: Callable, ccg: CCG, sentence: Union[str, Iterable[str]],
    executor: Callable[[Union[Function, ConstantExpression, FunctionApplicationExpression]], Any],
    criterion: Callable[[Any], bool],
    **by_parsing_kwargs
) -> List[CCGLearningResult]:
    """Learn a CCG lexicon from a sentence by both parsing the sentence and grounding the parsing result
    with a given executor.

    Only the first parsing result of each learning candidate is executed. A candidate is dropped when the
    criterion rejects the execution result or when a :class:`DSLExecutionError` is raised.

    Args:
        by_parsing_learning_func: the function to be used for learning a CCG lexicon from a sentence by
            parsing the sentence. Can be either :func:`by_parsing` or :func:`by_parsing_with_lexicon_gen`.
        ccg: the CCG grammar.
        sentence: the sentence to be parsed.
        executor: the executor used to ground the parsing result.
        criterion: the criterion used to classify whether the execution result is correct.
        **by_parsing_kwargs: the keyword arguments to be passed to the ``by_parsing_learning_func``.

    Returns:
        The result of the learning process, as a list of :class:`CCGLearningResult`.
    """

    def _is_grounded(candidate) -> bool:
        # Execute the expression of the first parse and test the outcome against the criterion.
        first_parse = candidate.parsing_results[0]
        try:
            return bool(criterion(executor(first_parse.expression.value)))
        except DSLExecutionError:
            return False

    candidates = by_parsing_learning_func(ccg, sentence, **by_parsing_kwargs)
    return [candidate for candidate in candidates if _is_grounded(candidate)]
def auto_research_novel_words(
    base_learning_func: Callable, ccg: CCG, sentence: Union[str, Iterable[str]], max_research: int, **kwargs
) -> List[CCGLearningResult]:
    """A helper function that automatically perform re-search for known words in a sentence.

    The base learning function is first called as-is. If it yields nothing, known words (words that already
    have lexicon entries in ``ccg``) are additionally treated as novel: first every single known word, then
    every pair, and so on up to ``max_research`` words. The results of the first round that produces any
    result are returned.

    Args:
        base_learning_func: the base learning function to be used for learning a CCG lexicon from a sentence.
            Can be :func:`by_parsing`, or :func:`by_parsing_with_lexicon_gen`, or :func:`by_grounding`.
        ccg: the CCG grammar.
        sentence: the sentence to be parsed. A string is split on whitespace.
        max_research: the maximum number of words whose lexicon will be re-searched.
        **kwargs: additional keyword arguments to be passed to the ``base_learning_func``. During re-search
            rounds, ``novel_words`` is overridden with the detected novel words plus the re-searched words.

    Returns:
        The result of the learning process, as a list of :class:`CCGLearningResult`; empty if no round
        succeeds.
    """
    if isinstance(sentence, str):
        sentence = sentence.split()
    elif not isinstance(sentence, (list, tuple)):
        # The sentence is reused for every learning call; materialize one-shot iterables once.
        sentence = list(sentence)

    # Dicts are used as ordered sets: words are de-duplicated but keep first-appearance order, so the
    # re-search order and the order of the accumulated results do not depend on string hash randomization.
    novel_words = dict()
    known_words = dict()
    for word in sentence:
        if word not in ccg.lexicon_entries or len(ccg.lexicon_entries[word]) == 0:
            novel_words[word] = None
        else:
            known_words[word] = None
    novel_words_tuple = tuple(novel_words)
    known_words_tuple = tuple(known_words)

    rv = base_learning_func(ccg, sentence, **kwargs)
    if len(rv) > 0:
        return rv

    # Rounds larger than the number of known words have no combinations to try.
    for nr_research in range(1, min(max_research, len(known_words_tuple)) + 1):
        rv_accumulate = list()
        for comb in itertools.combinations(known_words_tuple, nr_research):
            research_kwargs = dict(kwargs, novel_words=novel_words_tuple + comb)
            rv_accumulate.extend(base_learning_func(ccg, sentence, **research_kwargs))
        if len(rv_accumulate) > 0:
            return rv_accumulate
    return list()