Skip to content

Huggingface

HuggingFaceEmbedding

Bases: AbstractEmbeddingModel

Class for embedding models using HuggingFace.

Source code in src/embedding/huggingface.py
(source lines 9–38)
class HuggingFaceEmbedding(AbstractEmbeddingModel):
    """
    Class for embedding models using HuggingFace.

    Wraps a BERT checkpoint and exposes mean-pooled last-hidden-state
    embeddings for input strings.
    """

    def __init__(self, name, model, split_camel: bool = False):
        """
        :param name: human-readable identifier for this embedding model.
        :param model: HuggingFace model name or path of a BERT checkpoint.
        :param split_camel: if True, split camelCase tokens before encoding.
        """
        super().__init__(split_camel=split_camel)
        self._name = f"{name}"
        self.model = BertModel.from_pretrained(model)
        # Lower-casing matches the expectations of uncased BERT checkpoints.
        self.tokenizer = BertTokenizer.from_pretrained(
            model, do_lower_case=True
        )

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Return the mean-pooled BERT embedding of ``text``.

        :param text: input string to embed.
        :return: 1-D numpy array (hidden-size length) embedding vector.
        """
        if self._split_camel:
            text = " ".join(self.split(text))
        # Batch of a single sequence: shape (1, seq_len).
        input_ids = torch.tensor(self.tokenizer.encode(text)).unsqueeze(0)
        # Inference only — skip autograd bookkeeping to save time and memory.
        with torch.no_grad():
            outputs = self.model(input_ids)
        # The last hidden state is the first element of the output tuple.
        last_hidden_states = outputs[0]
        # Mean-pool over the token dimension and drop the batch dimension.
        return last_hidden_states.mean(1).detach().numpy()[0]

get_embedding(text)

Returns the embedding of the text.

Parameters:

    text (str): the input text to embed — required.

Returns:

    ndarray: the embedding vector of the text.
Source code in src/embedding/huggingface.py
(source lines 23–38)
def get_embedding(self, text: str) -> np.ndarray:
    """
    Return the mean-pooled BERT embedding of ``text``.

    The text is tokenized, run through the model, and the last hidden
    states are averaged over the sequence (token) dimension.

    :param text: input string to embed.
    :return: 1-D numpy array (hidden-size length) embedding vector.
    """
    if self._split_camel:
        text = " ".join(self.split(text))
    # Batch of a single sequence: shape (1, seq_len).
    input_ids = torch.tensor(self.tokenizer.encode(text)).unsqueeze(0)
    # Inference only — skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        outputs = self.model(input_ids)
    # The last hidden state is the first element of the output tuple.
    last_hidden_states = outputs[0]
    # Mean-pool over the token dimension and drop the batch dimension.
    return last_hidden_states.mean(1).detach().numpy()[0]

SentenceTransformersEmbedding

Bases: AbstractEmbeddingModel

Source code in src/embedding/huggingface.py
(source lines 41–57)
class SentenceTransformersEmbedding(AbstractEmbeddingModel):
    """
    Embedding model backed by the sentence-transformers library.
    """

    def __init__(self, name, model, device="cpu", split_camel: bool = False):
        """
        :param name: human-readable identifier for this embedding model.
        :param model: sentence-transformers model name or path.
        :param device: device string for inference (default ``"cpu"``).
        :param split_camel: if True, split camelCase tokens before encoding.
        """
        super().__init__(split_camel=split_camel)
        self._name = f"{name}"
        self.model = SentenceTransformer(model, device=device)
        # Make sure the underlying tokenizer has a padding token configured.
        self.model.tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Return the sentence-transformers embedding of ``text``.

        :param text: input string to embed.
        :return: embedding vector produced by ``SentenceTransformer.encode``.
        """
        if self._split_camel:
            text = " ".join(self.split(text))
        return self.model.encode(text)

get_embedding(text)

Returns the embedding of the text.

Parameters:

    text (str): the input text to embed — required.

Returns:

    ndarray: the embedding vector of the text.
Source code in src/embedding/huggingface.py
(source lines 48–57)
def get_embedding(self, text: str) -> np.ndarray:
    """
    Return the embedding vector for ``text``.

    Camel-case splitting is applied first when the model was configured
    with ``split_camel=True``.

    :param text: input string to embed.
    :return: the encoder's embedding of the (possibly split) text.
    """
    source = " ".join(self.split(text)) if self._split_camel else text
    return self.model.encode(source)