# In Cerberus HPC > VS Code > Terminal:
#   ml python3/3.12.9
#   ml ollama/0.6.8
#   ollama serve &
#   python3 -m venv ollama_venv        # create a new venv, or reuse an existing one
#   source ollama_venv/bin/activate
#   pip install requests numpy ollama
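#
# The script uses the 'mxbai-embed-large' embedding model and the 'llama3.1:8b'
# chat model referenced below; if they are not already in the local Ollama cache,
# pull them first (this assumes the node can reach the Ollama model registry):
#   ollama pull mxbai-embed-large
#   ollama pull llama3.1:8b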

import json
import os

import numpy as np
import ollama
from numpy.linalg import norm


def parse_file(filename, words_per_chunk=100):
    """Read a Project Gutenberg text file and split it into fixed-size word chunks."""
    with open(filename, encoding="utf-8-sig") as f:
        text = f.read()
    # Strip the Project Gutenberg header/footer if the markers are present
    start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"
    start = text.find(start_marker)
    end = text.find(end_marker)
    if start != -1 and end != -1:
        text = text[start + len(start_marker):end]
    # Split the remaining text into chunks of roughly words_per_chunk words
    words = text.split()
    chunks = []
    for i in range(0, len(words), words_per_chunk):
        chunk = " ".join(words[i:i + words_per_chunk])
        chunks.append(chunk)
    return chunks


def save_embeddings(filename, embeddings):
    """Write embeddings to embeddings/<filename>.json."""
    # Create the embeddings directory if it doesn't exist
    if not os.path.exists("embeddings"):
        os.makedirs("embeddings")
    # Dump the embeddings to JSON
    with open(f"embeddings/{filename}.json", "w") as f:
        json.dump(embeddings, f)
    print(f"saved embeddings to embeddings/{filename}.json")


def load_embeddings(filename):
    """Load cached embeddings from embeddings/<filename>.json, or return False if absent."""
    if not os.path.exists(f"embeddings/{filename}.json"):
        return False
    with open(f"embeddings/{filename}.json", "r") as f:
        return json.load(f)


def get_embeddings(filename, modelname, chunks):
    """Return one embedding vector per chunk, using the JSON cache when available."""
    # Reuse cached embeddings if they have already been saved
    if (embeddings := load_embeddings(filename)) is not False:
        return embeddings
    # Otherwise embed every chunk with the Ollama embedding model
    embeddings = [
        ollama.embeddings(model=modelname, prompt=chunk)["embedding"]
        for chunk in chunks
    ]
    save_embeddings(filename, embeddings)
    return embeddings
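
# NOTE (usage assumption): the cache is keyed only by the source filename, so if
# words_per_chunk or the embedding model changes, delete the cached file
# (e.g. embeddings/book.txt.json) so the chunks and stored vectors stay in sync.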


def find_similar(needle, haystack):
    """Rank haystack embeddings by cosine similarity to the needle embedding."""
    needle_norm = norm(needle)
    # Cosine similarity: dot(a, b) / (|a| * |b|)
    similarity_scores = [
        np.dot(needle, item) / (needle_norm * norm(item)) for item in haystack
    ]
    # Return (score, index) pairs, best match first
    return sorted(zip(similarity_scores, range(len(haystack))), reverse=True)
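
# find_similar returns every (score, index) pair sorted best-first; the caller
# (main, below) keeps only the top five matches for the prompt context.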


def main():
    SYSTEM_PROMPT = """You are a helpful reading assistant who answers questions
based on the snippets of text provided in context. Answer only using the context
provided, being as concise as possible and quoting from the context when useful.
If you are unsure, just say you don't know.
Context:
"""
    filename = "book.txt"
    paragraphs = parse_file(filename, words_per_chunk=100)
    embeddings = get_embeddings(filename, "mxbai-embed-large", paragraphs)

    prompt = input("What do you want? ")
    prompt_embedding = ollama.embeddings(model="mxbai-embed-large", prompt=prompt)[
        "embedding"
    ]

    # Retrieve the five most similar chunks
    most_similar_chunks = find_similar(prompt_embedding, embeddings)[:5]
    print("Top similar chunks:")
    for score, idx in most_similar_chunks:
        print(f"{score:.4f} - {idx} - {paragraphs[idx][:100]!r}")

    # Build the system message: instructions followed by the retrieved context
    context = SYSTEM_PROMPT + "\n" + "\n---\n".join(
        paragraphs[idx] for _, idx in most_similar_chunks
    )
    response = ollama.chat(
        model="llama3.1:8b",
        messages=[
            {"role": "system", "content": context},
            {"role": "user", "content": prompt},
        ],
    )
    print(response["message"]["content"])


if __name__ == "__main__":
    main()
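
# Example run (a sketch, assuming "ollama serve" is running, the models above are
# pulled, and a Project Gutenberg plain-text file is saved as book.txt in the
# working directory):
#   python3 demo_HPC_rag.py
# The script prints the five highest-scoring chunks and then the model's answer,
# generated only from those retrieved snippets.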