LlamaCpp

Neil Haddley · August 3, 2023

Accessing the llama.cpp model from Python

AI · llama-cpp · local-llm · llama · python

The llama.cpp project makes it possible to run Large Language Models locally, using a plain C/C++ inference engine.

The llama-cpp-python module allowed me to access llama.cpp models from Python.
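
The module can also be called directly, without LangChain. A minimal sketch, assuming the same quantized GGML model file used in the apps below:

PYTHON
# Direct use of llama-cpp-python (no LangChain wrapper).
from llama_cpp import Llama

llm = Llama(model_path="llama-2-7b-chat.ggmlv3.q4_0.bin")

# The call returns an OpenAI-style completion dictionary.
output = llm("Q: What is the Capital of England? A:", max_tokens=64, stop=["Q:"])
print(output["choices"][0]["text"])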

I tested with "What is the Capital of England?" and reviewed the first response. I ran streamlit run app1a.py

What is the Capital of England? Second response.

Requirements

BASH
$ conda create --name llama jupyterlab ipykernel ipywidgets
$ conda activate llama
$ pip install -r requirements.txt
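
The apps below expect a quantized GGML model file (for example llama-2-7b-chat.ggmlv3.q4_0.bin) in the working directory. One way to fetch it is sketched here; the TheBloke/Llama-2-7B-Chat-GGML Hugging Face repo is an assumption, and any GGML build of Llama 2 7B Chat would work. On Apple Silicon, llama-cpp-python can also be built with Metal support (see the macos.md link referenced in app2a.py below).

PYTHON
# Hypothetical download step: fetch a quantized GGML model from Hugging Face.
# The repo_id below is an assumption; substitute any GGML build of Llama 2 Chat.
# pip install huggingface_hub  (not in requirements.txt above)
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGML",
    filename="llama-2-7b-chat.ggmlv3.q4_0.bin",
    local_dir=".",
)
print(model_path)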

LangChain

I updated my LangChain medical report application to work with Llama.cpp.

I ran streamlit run app2a.py

Pirate Jack

I updated a Llama2 Chat code sample to create a "Pirate Jack" application.

I tested with "If I have 17 Doubloons and spend 6 on a Pistol how many would I have left?" (using M2 Apple Silicon GPU). I ran streamlit run app3a.py

How much does a Cutlass cost? (from "memory"... it be costin' 3 Doubloons, savvy?)

Chat application's memory
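
The "memory" here is simply the running message list kept in st.session_state: each turn appends a HumanMessage and the AIMessage reply, and the full history is folded back into a single Llama 2 prompt by llama_v2_prompt (see app3a.py below). That is how Pirate Jack can recall the price of the Cutlass.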

requirements.txt

TEXT
streamlit
langchain
openai
tiktoken
llama-cpp-python
pypdf
torch
InstructorEmbedding
sentence_transformers
chromadb
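
In the scripts below, streamlit provides the UI, langchain the orchestration, and llama-cpp-python the local model; pypdf loads the PDF; torch, InstructorEmbedding and sentence_transformers back HuggingFaceInstructEmbeddings; chromadb is the vector store; openai and tiktoken support the commented-out OpenAI alternative.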

app1a.py

PYTHON
# from langchain.llms import OpenAI
from langchain.llms import LlamaCpp
import streamlit

# llm = OpenAI(temperature=0.9)
llm = LlamaCpp(
    model_path="llama-2-7b-chat.ggmlv3.q4_0.bin",
    verbose=True,
)

prompt = streamlit.text_input('Input your prompt')

if prompt:
    response = llm(prompt)
    streamlit.write(response)

hp4.ipynb

PYTHON
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

loader = PyPDFLoader('hp4.pdf')
pages = loader.load_and_split()

# Define chunk size, overlap and separators
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=40,
    length_function=len,
    separators=["\n \n", " ", ""]
)

# split the pages into paragraphs as defined above
paragraphs = text_splitter.split_documents(pages)

---

# save OpenAIEmbeddings to "Chroma" directory
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

save_directory = "Chroma"

store = Chroma.from_documents(paragraphs, embeddings, collection_name='hp4', persist_directory=save_directory)
store.persist()

# search for similar paragraphs
search = store.similarity_search_with_score('Does the patient smoke?')
print(search)

---

# save HuggingFaceInstructEmbeddings to "Chroma2" directory
from langchain.embeddings import HuggingFaceInstructEmbeddings

embeddings2 = HuggingFaceInstructEmbeddings()

save_directory2 = "Chroma2"

store2 = Chroma.from_documents(paragraphs, embeddings2, collection_name='hp4', persist_directory=save_directory2)
store2.persist()

# search for similar paragraphs
search2 = store2.similarity_search_with_score('Does the patient smoke?')
print(search2)
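
HuggingFaceInstructEmbeddings downloads an Instructor model (hkunlp/instructor-large by default, if I recall the LangChain default correctly), so the Chroma2 store can be built and queried without an OpenAI key.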

app2a.py

PYTHON
import streamlit as st

# from langchain.llms import OpenAI
from langchain.llms import LlamaCpp

# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings

from langchain.vectorstores import Chroma

from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo
)

# embeddings = OpenAIEmbeddings()
embeddings = HuggingFaceInstructEmbeddings()

# llm = OpenAI(temperature=0.9, verbose=True)

# https://python.langchain.com/docs/integrations/llms/llamacpp
# https://github.com/langchain-ai/langchain/issues/8004
# https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md

llm = LlamaCpp(
    model_path="llama-2-7b-chat.ggmlv3.q4_0.bin",
    verbose=True,
    temperature=1,
    max_tokens=2048,  # 256
    n_gpu_layers=1,
    n_batch=512,
    f16_kv=True,  # MUST set to True, otherwise you will run into problems after a couple of calls
    n_ctx=10240  # Context Length
)

load_directory = "Chroma2"

# load the persisted embeddings from the "Chroma2" directory
db = Chroma(persist_directory=load_directory, collection_name='hp4', embedding_function=embeddings)

vectorstore_info = VectorStoreInfo(
    name="hp4",
    description="embeddings generated from the pdf document",
    vectorstore=db
)

toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)

agent_executor = create_vectorstore_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True
)

prompt = st.text_input('Input your prompt')

if prompt:
    response = agent_executor.run(prompt)
    st.write(response)

    with st.expander('Document Similarity Search'):
        search = db.similarity_search_with_score(prompt)
        st.write(search)

app3a.py

PYTHON
#!pip install streamlit
#!pip install llama-cpp-python
#!pip install watchdog

from dotenv import load_dotenv, find_dotenv
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import streamlit as st

from typing import Union, List


def init_page() -> None:
    st.set_page_config(page_title="Pirate Jack")
    st.header("Pirate Jack")
    st.sidebar.title("Options")


def init_messages() -> None:
    clear_button = st.sidebar.button("Clear Conversation", key="clear")
    if clear_button or "messages" not in st.session_state:
        st.session_state.messages = [
            SystemMessage(
                # content="You are a helpful AI assistant. Reply your answer in markdown format.")
                content="You are now Pirate Jack. Always talk like a pirate. Let's think step by step."
            )
        ]
        st.session_state.costs = []


def select_llm() -> Union[ChatOpenAI, LlamaCpp]:
    model_name = st.sidebar.radio(
        "Choose LLM:",
        (
            "llama-2-7b-chat.ggmlv3.q2_K",
            "codeup-llama-2-13b-chat-hf.ggmlv3.q2_K",
            "stablebeluga-13b.ggmlv3.q4_K_S",
            "gpt-3.5-turbo-0613",
            "gpt-4",
        ),
    )

    temperature = st.sidebar.slider(
        "Temperature:", min_value=0.0, max_value=1.0, value=0.0, step=0.01
    )
    if model_name.startswith("gpt-"):
        return ChatOpenAI(temperature=temperature, model_name=model_name)
    elif (
        model_name.startswith("llama-2-")
        or model_name.startswith("codeup-llama-2-")
        or model_name.startswith("stablebeluga-")
    ):
        callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

        if model_name.startswith("llama-2-"):
            return LlamaCpp(
                model_path=f"./{model_name}.bin",
                callback_manager=callback_manager,
                verbose=True,
                temperature=temperature,
                max_tokens=2048,  # 256
                n_ctx=1024,  # Context Length
                n_gpu_layers=1,  # Metal: set to 1 is enough.
                n_batch=4,  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
                f16_kv=True,  # MUST set to True, otherwise you will run into problems after a couple of calls
            )
        else:
            return LlamaCpp(
                model_path=f"./{model_name}.bin",
                callback_manager=callback_manager,
                verbose=True,
                temperature=temperature,
                max_tokens=2048,  # 256
            )


def get_answer(llm, messages) -> tuple[str, float]:
    if isinstance(llm, ChatOpenAI):
        with get_openai_callback() as cb:
            answer = llm(messages)
        return answer.content, cb.total_cost
    if isinstance(llm, LlamaCpp):
        answer = llm(llama_v2_prompt(convert_langchainschema_to_dict(messages)))
        return (answer, 0.0)


def find_role(message: Union[SystemMessage, HumanMessage, AIMessage]) -> str:
    """
    Identify role name from langchain.schema object.
    """
    if isinstance(message, SystemMessage):
        return "system"
    if isinstance(message, HumanMessage):
        return "user"
    if isinstance(message, AIMessage):
        return "assistant"
    raise TypeError("Unknown message type.")


def convert_langchainschema_to_dict(
    messages: List[Union[SystemMessage, HumanMessage, AIMessage]]
) -> List[dict]:
    """
    Convert the chain of chat messages in list of langchain.schema format to
    list of dictionary format.
    """
    return [
        {"role": find_role(message), "content": message.content} for message in messages
    ]


def llama_v2_prompt(messages: List[dict]) -> str:
    """
    Convert the messages in list of dictionary format to Llama 2 compliant format.
    """
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    BOS, EOS = "<s>", "</s>"
    DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

    if messages[0]["role"] != "system":
        messages = [
            {
                "role": "system",
                "content": DEFAULT_SYSTEM_PROMPT,
            }
        ] + messages
    # Fold the system prompt into the first user message.
    messages = [
        {
            "role": messages[1]["role"],
            "content": B_SYS + messages[0]["content"] + E_SYS + messages[1]["content"],
        }
    ] + messages[2:]

    # Pair each user prompt with the assistant answer that followed it.
    messages_list = [
        f"{BOS}{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} {EOS}"
        for prompt, answer in zip(messages[::2], messages[1::2])
    ]
    # The final (unanswered) user message opens a new [INST] block; no EOS here,
    # or the model would treat the prompt as already finished.
    messages_list.append(
        f"{BOS}{B_INST} {(messages[-1]['content']).strip()} {E_INST}"
    )

    result = "".join(messages_list)
    print(result)

    return result


def main() -> None:
    _ = load_dotenv(find_dotenv())

    init_page()
    llm = select_llm()
    init_messages()

    # Supervise user input
    if user_input := st.chat_input("Input your question!"):
        st.session_state.messages.append(HumanMessage(content=user_input))
        with st.spinner("Pirate Jack be thinking ..."):
            result = get_answer(llm, st.session_state.messages)
            if result is None:
                st.session_state.messages.append(
                    AIMessage(content="Sorry, I don't know the answer.")
                )
            else:
                answer, cost = result
                st.session_state.messages.append(AIMessage(content=answer))
                st.session_state.costs.append(cost)

    # Display chat history
    messages = st.session_state.get("messages", [])
    for message in messages:
        if isinstance(message, AIMessage):
            with st.chat_message("assistant"):
                st.markdown(message.content)
        elif isinstance(message, HumanMessage):
            with st.chat_message("user"):
                st.markdown(message.content)

    costs = st.session_state.get("costs", [])
    st.sidebar.markdown("## Costs")
    st.sidebar.markdown(f"**Total cost: ${sum(costs):.5f}**")
    for cost in costs:
        st.sidebar.markdown(f"- ${cost:.5f}")


# streamlit run app3a.py
if __name__ == "__main__":
    main()
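
For reference, a minimal sketch of the prompt string that llama_v2_prompt assembles for a short history (the example messages are illustrative; the output shown is traced through the function above):

PYTHON
# A quick check of the Llama 2 prompt format built by llama_v2_prompt above.
messages = [
    {"role": "system", "content": "You are now Pirate Jack."},
    {"role": "user", "content": "How much does a Cutlass cost?"},
]
llama_v2_prompt(messages)  # the function prints the assembled prompt:
# <s>[INST] <<SYS>>
# You are now Pirate Jack.
# <</SYS>>
#
# How much does a Cutlass cost? [/INST]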