Huggingface

HuggingFaceEmbedding

Bases: AbstractEmbeddingModel

Class for embedding models using HuggingFace.

Source code in src/embedding/huggingface.py
class HuggingFaceEmbedding(AbstractEmbeddingModel):
    """
    Class for embedding models using HuggingFace.
    """

    def __init__(self, name, model, split_camel: bool = False):
        super().__init__(split_camel=split_camel)
        self._name = f'{name}'
        # Lowercase text during tokenization, matching uncased BERT checkpoints.
        do_lower_case = True
        self.model = BertModel.from_pretrained(model)
        self.tokenizer = BertTokenizer.from_pretrained(model, do_lower_case=do_lower_case)

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Returns the embedding of the text.
        :param text: text to embed; split into separate words first if camel-case splitting is enabled.
        :return: mean-pooled last hidden state as a 1-D numpy array of length ``hidden_size``.
        """
        if self._split_camel:
            text = ' '.join(self.split(text))
        input_ids = torch.tensor(self.tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
        outputs = self.model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden state is the first element of the output tuple
        return last_hidden_states.mean(1).detach().numpy()[0]  # Mean over tokens, drop the batch dimension
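
A minimal usage sketch, assuming the import path matches the source file above and using 'bert-base-uncased' as an illustrative checkpoint id (not prescribed by this project):

from src.embedding.huggingface import HuggingFaceEmbedding

# Hypothetical checkpoint choice; the `name` argument is only a label
# carried by AbstractEmbeddingModel.
embedder = HuggingFaceEmbedding(name='bert-base', model='bert-base-uncased')
vector = embedder.get_embedding('getFileName')
print(vector.shape)  # e.g. (768,) for bert-base-sized models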

get_embedding(text)

Returns the embedding of the text.

Parameters:

    text (str): The text to embed. Required.

Returns:

    ndarray: Mean-pooled embedding vector of the text.

Source code in src/embedding/huggingface.py
def get_embedding(self, text: str) -> np.ndarray:
    """
    Returns the embedding of the text.
    :param text: text to embed; split into separate words first if camel-case splitting is enabled.
    :return: mean-pooled last hidden state as a 1-D numpy array of length ``hidden_size``.
    """
    if self._split_camel:
        text = ' '.join(self.split(text))
    input_ids = torch.tensor(self.tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = self.model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden state is the first element of the output tuple
    return last_hidden_states.mean(1).detach().numpy()[0]  # Mean over tokens, drop the batch dimension
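
For reference, a sketch of the pooling step in isolation, assuming `model` and `tokenizer` are loaded as in `__init__` above (hypothetical local names for `self.model` and `self.tokenizer`):

import torch

# Tokenize, run the encoder, and mean-pool the last hidden state over tokens.
input_ids = torch.tensor(tokenizer.encode('open file')).unsqueeze(0)  # shape: (1, seq_len)
with torch.no_grad():
    last_hidden = model(input_ids)[0]                                 # shape: (1, seq_len, hidden_size)
sentence_vector = last_hidden.mean(dim=1)[0].numpy()                  # shape: (hidden_size,)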

SentenceTransformersEmbedding

Bases: AbstractEmbeddingModel

Class for embedding models using Sentence-Transformers.

Source code in src/embedding/huggingface.py
class SentenceTransformersEmbedding(AbstractEmbeddingModel):
    """
    Class for embedding models using Sentence-Transformers.
    """

    def __init__(self, name, model, device='cpu', split_camel: bool = False):
        super().__init__(split_camel=split_camel)
        self._name = f'{name}'
        self.model = SentenceTransformer(model, device=device)
        # Register a padding token so the tokenizer can pad and batch inputs.
        self.model.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Returns the embedding of the text.
        :param text: text to embed; split into separate words first if camel-case splitting is enabled.
        :return: sentence embedding as a 1-D numpy array.
        """
        if self._split_camel:
            text = ' '.join(self.split(text))
        embeddings = self.model.encode(text)
        return embeddings
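
A minimal usage sketch, using 'all-MiniLM-L6-v2' as an illustrative sentence-transformers checkpoint (not prescribed by this project):

from src.embedding.huggingface import SentenceTransformersEmbedding

# Hypothetical checkpoint id; any sentence-transformers model should work.
embedder = SentenceTransformersEmbedding(name='minilm', model='all-MiniLM-L6-v2',
                                         device='cpu', split_camel=True)
vector = embedder.get_embedding('parseJsonFile')  # split into 'parse Json File' before encoding
print(vector.shape)  # e.g. (384,) for MiniLM-sized models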

get_embedding(text)

Returns the embedding of the text.

Parameters:

    text (str): The text to embed. Required.

Returns:

    ndarray: Sentence embedding of the text.

Source code in src/embedding/huggingface.py
def get_embedding(self, text: str) -> np.ndarray:
    """
    Returns the embedding of the text.
    :param text: text to embed; split into separate words first if camel-case splitting is enabled.
    :return: sentence embedding as a 1-D numpy array.
    """
    if self._split_camel:
        text = ' '.join(self.split(text))
    embeddings = self.model.encode(text)
    return embeddings
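
Since SentenceTransformer.encode also accepts a list of strings, several texts can be embedded in one call by using the wrapped model directly. A sketch, reusing the `embedder` from the earlier example (this bypasses get_embedding, so camel-case splitting is not applied):

texts = ['open file', 'close connection']
matrix = embedder.model.encode(texts)  # ndarray of shape (len(texts), embedding_dim)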