Skip to content

Keyword

KeywordLF

Bases: LFBase

Labelling function that uses identifiers and keywords.

Source code in src/annotation/keyword.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
class KeywordLF(LFBase):
    """
    Labelling function that uses identifiers and keywords.

    Scores each taxonomy label by the weighted multiplicity of its
    keywords found in the file content, then normalizes the scores
    into a probability vector.
    """

    def __init__(self, taxonomy: TaxonomyBase, key: str):
        super().__init__(taxonomy)
        self.key = key

    def annotate(self, name: str, content: str) -> np.ndarray:
        """
        Compute the probability for the file given the name and/or the content.

        :param name: Source file name (not used by this labelling function).
        :param content: Content of the file (usually identifiers),
            whitespace-separated.
        :return: Vector of length ``len(self.taxonomy)`` normalized to sum
            to 1, or all zeros when no keyword matches.
        """
        node_labels = np.zeros(len(self.taxonomy))
        # Build the content multiset once; it is invariant across labels.
        tokens = Multiset(content.split())
        for _label in self.taxonomy:
            label: KeywordLabel = _label
            # Multiset intersection preserves multiplicities, so repeated
            # occurrences of a keyword in the content are all counted.
            matches = Counter(label.keywords.intersection(tokens))
            node_labels[label.index] = sum(matches[k] * label.weights[k]
                                           for k in matches)

        norm = np.sum(node_labels)
        # Guard against division by zero when nothing matched.
        node_vec = node_labels / norm if norm > 0 else np.zeros(len(self.taxonomy))

        return node_vec

annotate(name, content)

Compute the probability for the file given the name and/or the content.

Parameters:

Name Type Description Default
name str

Source file name

required
content str

Content of the file (usually identifiers)

required

Returns:

Type Description
np.ndarray
Probability vector over the taxonomy labels, normalized to sum to 1 (all zeros when no keyword matches).
Source code in src/annotation/keyword.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def annotate(self, name: str, content: str) -> np.ndarray:
    """
    Compute the probability for the file given the name and/or the content.

    :param name: Source file name (not used by this labelling function).
    :param content: Content of the file (usually identifiers),
        whitespace-separated.
    :return: Vector of length ``len(self.taxonomy)`` normalized to sum
        to 1, or all zeros when no keyword matches.
    """
    node_labels = np.zeros(len(self.taxonomy))
    # Build the content multiset once; it is invariant across labels.
    tokens = Multiset(content.split())
    for _label in self.taxonomy:
        label: KeywordLabel = _label
        # Multiset intersection preserves multiplicities, so repeated
        # occurrences of a keyword in the content are all counted.
        matches = Counter(label.keywords.intersection(tokens))
        node_labels[label.index] = sum(matches[k] * label.weights[k]
                                       for k in matches)

    norm = np.sum(node_labels)
    # Guard against division by zero when nothing matched.
    node_vec = node_labels / norm if norm > 0 else np.zeros(len(self.taxonomy))

    return node_vec