LlamaCpp

Neil Haddley · August 3, 2023

Accessing llama.cpp models from Python

The llama.cpp project makes it possible to run Large Language Models locally, using quantized GGML model files (the .ggmlv3 .bin files used below).

The llama-cpp-python module allowed me to access llama.cpp models from Python.
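
For comparison, llama-cpp-python can also be called directly, without LangChain. A minimal sketch, assuming the same llama-2-7b-chat.ggmlv3.q4_0.bin file used in app1a.py below has been downloaded into the working directory:

from llama_cpp import Llama

# load the quantized GGML model
llm = Llama(model_path="llama-2-7b-chat.ggmlv3.q4_0.bin")

# complete the prompt, stopping if the model starts a new "Q:" line
output = llm("Q: What is the Capital of England? A:", max_tokens=64, stop=["Q:"])

print(output["choices"][0]["text"])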

What is the Capital of England? First response. (streamlit run app1a.py)

What is the Capital of England? Second response.

Requirements

$ conda create --name llama jupyterlab ipykernel ipywidgets

$ conda activate llama

$ pip install -r requirements.txt
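
Note: to use Apple silicon's Metal GPU (the n_gpu_layers setting in the apps below), llama-cpp-python has to be compiled with Metal enabled. At the time of writing, the macOS install guide (linked in a comment in app2a.py) suggested something like:

$ CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python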

LangChain

I updated my LangChain medical report application to work with llama.cpp.

streamlit run app2a.py

Pirate Jack

I updated a Llama2 Chat code sample to create a "Pirate Jack" application.

If I have 17 Doubloons and spend 6 on a Pistol, how many would I have left? (using the M2 Apple silicon GPU; streamlit run app3a.py)

How much does a Cutlass cost? (from "memory"... it be costin' 3 Doubloons, savvy?)

Chat application's memory
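
The "memory" here is not a separate store: the full message list accumulates in st.session_state and is replayed into the prompt on every turn. A minimal sketch, using the llama_v2_prompt() helper from app3a.py below and an illustrative transcript:

# the whole conversation accumulates in st.session_state.messages ...
messages = [
    {"role": "system", "content": "You are now Pirate Jack. Always talk like a pirate."},
    {"role": "user", "content": "A Cutlass be costin' 3 Doubloons, aye?"},
    {"role": "assistant", "content": "Aye, 3 Doubloons it be, savvy?"},
    {"role": "user", "content": "How much does a Cutlass cost?"},
]

# ... and llama_v2_prompt() folds the entire history into one Llama 2
# prompt, so earlier turns stay visible to the model.
prompt = llama_v2_prompt(messages)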

requirements.txt

streamlit
langchain
openai
tiktoken
llama-cpp-python
pypdf
torch
InstructorEmbedding
sentence_transformers
chromadb

app1a.py

# from langchain.llms import OpenAI
from langchain.llms import LlamaCpp
import streamlit

# llm = OpenAI(temperature=0.9)
llm = LlamaCpp(
    model_path="llama-2-7b-chat.ggmlv3.q4_0.bin",
    verbose=True,
)

prompt = streamlit.text_input('Input your prompt')

if prompt:
    response = llm(prompt)
    streamlit.write(response)

hp4.ipynb

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

loader = PyPDFLoader('hp4.pdf')
pages = loader.load_and_split()

# Define chunk size, overlap and separators

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,      # 1024-character chunks
    chunk_overlap=40,     # with 40 characters of overlap
    length_function=len,
    separators=["\n \n", " ", ""]
)

# split the pages into paragraphs as defined above

paragraphs = text_splitter.split_documents(pages)

---

# save OpenAIEmbeddings to "Chroma" directory

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

save_directory = "Chroma"

store = Chroma.from_documents(paragraphs, embeddings, collection_name='hp4', persist_directory=save_directory)
store.persist()

# search for similar paragraphs
search = store.similarity_search_with_score('Does the patient smoke?')

print(search)

---

# save HuggingFaceInstructEmbeddings to "Chroma2" directory

from langchain.embeddings import HuggingFaceInstructEmbeddings

embeddings2 = HuggingFaceInstructEmbeddings()

save_directory2 = "Chroma2"

store2 = Chroma.from_documents(paragraphs, embeddings2, collection_name='hp4', persist_directory=save_directory2)
store2.persist()

# search for similar paragraphs
search2 = store2.similarity_search_with_score('Does the patient smoke?')

print(search2)
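
similarity_search_with_score() returns (document, score) pairs rather than bare documents; with Chroma's default distance metric, a lower score means a closer match. A small sketch unpacking the store2 results from above:

# print each match's distance score and the start of its text
for doc, score in search2:
    print(f"{score:.3f}", doc.page_content[:80])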

app2a.py

import streamlit as st

# from langchain.llms import OpenAI
from langchain.llms import LlamaCpp

# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings

from langchain.vectorstores import Chroma

from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo
)

# embeddings = OpenAIEmbeddings()
embeddings = HuggingFaceInstructEmbeddings()

# llm = OpenAI(temperature=0.9, verbose=True)

# https://python.langchain.com/docs/integrations/llms/llamacpp
# https://github.com/langchain-ai/langchain/issues/8004
# https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md

llm = LlamaCpp(
    model_path="llama-2-7b-chat.ggmlv3.q4_0.bin",
    verbose=True,
    temperature=1,
    max_tokens=2048,
    n_gpu_layers=1,  # offload to the Metal GPU
    n_batch=512,
    f16_kv=True,  # MUST be set to True, otherwise you will run into problems after a couple of calls
    n_ctx=10240  # context length
)

load_directory = "Chroma2"

# load embeddings from the "Chroma2" directory
db = Chroma(persist_directory=load_directory, collection_name='hp4', embedding_function=embeddings)

vectorstore_info = VectorStoreInfo(
    name="hp4",
    description="embeddings generated from the pdf document",
    vectorstore=db
)

toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)

agent_executor = create_vectorstore_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True
)

prompt = st.text_input('Input your prompt')

if prompt:
    response = agent_executor.run(prompt)
    st.write(response)

    with st.expander('Document Similarity Search'):
        search = db.similarity_search_with_score(prompt)
        st.write(search)
app3a.py

#!pip install streamlit
#!pip install llama-cpp-python
#!pip install watchdog

from dotenv import load_dotenv, find_dotenv
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import streamlit as st

from typing import List, Union


def init_page() -> None:
    st.set_page_config(page_title="Pirate Jack")
    st.header("Pirate Jack")
    st.sidebar.title("Options")


def init_messages() -> None:
    clear_button = st.sidebar.button("Clear Conversation", key="clear")
    if clear_button or "messages" not in st.session_state:
        st.session_state.messages = [
            SystemMessage(
                # content="You are a helpful AI assistant. Reply your answer in markdown format.")
                content="You are now Pirate Jack. Always talk like a pirate. Let's think step by step."
            )
        ]
        st.session_state.costs = []


def select_llm() -> Union[ChatOpenAI, LlamaCpp]:
    model_name = st.sidebar.radio(
        "Choose LLM:",
        (
            "llama-2-7b-chat.ggmlv3.q2_K",
            "codeup-llama-2-13b-chat-hf.ggmlv3.q2_K",
            "stablebeluga-13b.ggmlv3.q4_K_S",
            "gpt-3.5-turbo-0613",
            "gpt-4",
        ),
    )

    temperature = st.sidebar.slider(
        "Temperature:", min_value=0.0, max_value=1.0, value=0.0, step=0.01
    )
    if model_name.startswith("gpt-"):
        return ChatOpenAI(temperature=temperature, model_name=model_name)
    elif (
        model_name.startswith("llama-2-")
        or model_name.startswith("codeup-llama-2-")
        or model_name.startswith("stablebeluga-")
    ):
        callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

        if model_name.startswith("llama-2-"):
            return LlamaCpp(
                model_path=f"./{model_name}.bin",
                callback_manager=callback_manager,
                verbose=True,
                temperature=temperature,
                max_tokens=2048,
                n_ctx=1024,  # context length
                n_gpu_layers=1,  # for Metal, 1 is enough
                n_batch=4,  # should be between 1 and n_ctx; consider the amount of RAM of your Apple Silicon chip
                f16_kv=True,  # MUST be set to True, otherwise you will run into problems after a couple of calls
            )
        else:
            return LlamaCpp(
                model_path=f"./{model_name}.bin",
                callback_manager=callback_manager,
                verbose=True,
                temperature=temperature,
                max_tokens=2048,
            )


def get_answer(llm, messages) -> tuple[str, float]:
    if isinstance(llm, ChatOpenAI):
        with get_openai_callback() as cb:
            answer = llm(messages)
        return answer.content, cb.total_cost
    if isinstance(llm, LlamaCpp):
        answer = llm(llama_v2_prompt(convert_langchainschema_to_dict(messages)))
        return (answer, 0.0)


def find_role(message: Union[SystemMessage, HumanMessage, AIMessage]) -> str:
    """
    Identify the role name from a langchain.schema object.
    """
    if isinstance(message, SystemMessage):
        return "system"
    if isinstance(message, HumanMessage):
        return "user"
    if isinstance(message, AIMessage):
        return "assistant"
    raise TypeError("Unknown message type.")


def convert_langchainschema_to_dict(
    messages: List[Union[SystemMessage, HumanMessage, AIMessage]]
) -> List[dict]:
    """
    Convert the chain of chat messages from langchain.schema format to a
    list of dictionaries.
    """
    return [
        {"role": find_role(message), "content": message.content} for message in messages
    ]


def llama_v2_prompt(messages: List[dict]) -> str:
    """
    Convert the messages from list-of-dictionary format to the Llama 2
    compliant prompt format.
    """
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    BOS, EOS = "<s>", "</s>"
    DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

    # prepend the default system prompt if none was supplied
    if messages[0]["role"] != "system":
        messages = [
            {
                "role": "system",
                "content": DEFAULT_SYSTEM_PROMPT,
            }
        ] + messages
    # fold the system prompt into the first user message
    messages = [
        {
            "role": messages[1]["role"],
            "content": B_SYS + messages[0]["content"] + E_SYS + messages[1]["content"],
        }
    ] + messages[2:]

    # each (user, assistant) pair becomes one [INST] ... [/INST] answer block
    messages_list = [
        f"{BOS}{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} {EOS}"
        for prompt, answer in zip(messages[::2], messages[1::2])
    ]
    messages_list.append(
        f"{BOS}{B_INST} {(messages[-1]['content']).strip()} {E_INST} {EOS}"
    )

    result = "".join(messages_list)
    print(result)

    return result


def main() -> None:
    _ = load_dotenv(find_dotenv())

    init_page()
    llm = select_llm()
    init_messages()

    # Supervise user input
    if user_input := st.chat_input("Input your question!"):
        st.session_state.messages.append(HumanMessage(content=user_input))
        with st.spinner("Pirate Jack be thinking ..."):
            result = get_answer(llm, st.session_state.messages)
            if result is None:
                st.session_state.messages.append(
                    AIMessage(content="Sorry, I don't know the answer.")
                )
            else:
                answer, cost = result
                st.session_state.messages.append(AIMessage(content=answer))
                st.session_state.costs.append(cost)

    # Display chat history
    messages = st.session_state.get("messages", [])
    for message in messages:
        if isinstance(message, AIMessage):
            with st.chat_message("assistant"):
                st.markdown(message.content)
        elif isinstance(message, HumanMessage):
            with st.chat_message("user"):
                st.markdown(message.content)

    costs = st.session_state.get("costs", [])
    st.sidebar.markdown("## Costs")
    st.sidebar.markdown(f"**Total cost: ${sum(costs):.5f}**")
    for cost in costs:
        st.sidebar.markdown(f"- ${cost:.5f}")


# streamlit run app3a.py
if __name__ == "__main__":
    main()