LlamaCpp
Neil Haddley • August 3, 2023
Accessing the llama.cpp model from Python
The llama.cpp project makes it possible to run Large Language Models locally.
The llama-cpp-python module allowed me to access a llama.cpp model from Python.
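Before wiring anything into LangChain or Streamlit, the module can be exercised on its own. A minimal sketch, assuming the same quantized GGML model file used in the apps below has already been downloaded to the working directory:

from llama_cpp import Llama

# Load the local quantized Llama 2 chat model (the path is an assumption; point it at your download)
llm = Llama(model_path="llama-2-7b-chat.ggmlv3.q4_0.bin")

# Completion-style call; the result is an OpenAI-like dictionary
output = llm("Q: What is the Capital of England? A:", max_tokens=64, stop=["Q:"])
print(output["choices"][0]["text"])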

What is the Capital of England? First response.
streamlit run app1a.py

What is the Capital of England? Second response.
Requirements
$ conda create --name llama jupyterlab ipykernel ipywidgets
$ conda activate llama
$ pip install -r requirements.txt
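On Apple silicon it is worth building llama-cpp-python with Metal support, so that the n_gpu_layers settings used below actually offload work to the GPU. At the time of writing, the macOS install guide linked from app2a.py suggested a command along these lines (check the guide for the current flags):

$ CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir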
LangChain
I updated my LangChain medical report application to work with Llama.cpp.

streamlit run app2a.py
Pirate Jack
I updated a Llama2 Chat code sample to create a "Pirate Jack" application.

If I have 17 Doubloons and spend 6 on a Pistol, how many would I have left? (using the M2 Apple silicon GPU)
streamlit run app3a.py

How much does a Cutlass cost? (from "memory"... it be costin' 3 Doubloons, savvy?)

Chat application's memory
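The "memory" is simply the Streamlit session state: every HumanMessage and AIMessage is appended to st.session_state.messages, and on each turn the whole history is folded into a single Llama 2 prompt by llama_v2_prompt() in app3a.py below. A minimal sketch of that folding (the conversation here is invented for illustration):

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

system = "You are now Pirate Jack. Always talk like a pirate."
completed_turns = [
    ("How much does a Cutlass cost?", "It be costin' 3 Doubloons, savvy?"),
]
pending_question = "And how much be a Pistol?"

prompt = ""
for i, (user, assistant) in enumerate(completed_turns):
    # The system prompt is folded into the first user message
    user_text = (B_SYS + system + E_SYS + user) if i == 0 else user
    prompt += f"<s>{B_INST} {user_text} {E_INST} {assistant} </s>"
prompt += f"<s>{B_INST} {pending_question} {E_INST}"

# The full conversation travels with every request, which is why earlier answers are "remembered"
print(prompt)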
requirements.txt
TEXT
streamlit
langchain
openai
tiktoken
llama-cpp-python
pypdf
torch
InstructorEmbedding
sentence_transformers
chromadb
app1a.py
TEXT
# from langchain.llms import OpenAI
from langchain.llms import LlamaCpp
import streamlit

# llm = OpenAI(temperature=0.9)
llm = LlamaCpp(
    model_path="llama-2-7b-chat.ggmlv3.q4_0.bin",
    verbose=True,
)

prompt = streamlit.text_input('Input your prompt')

if prompt:
    response = llm(prompt)
    streamlit.write(response)
hp4.ipynb
TEXT
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

loader = PyPDFLoader('hp4.pdf')
pages = loader.load_and_split()

# Define chunk size, overlap and separators
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=40,
    length_function=len,
    separators=["\n \n", " ", ""]
)

# split the pages into paragraphs as defined above
paragraphs = text_splitter.split_documents(pages)

---

# save OpenAIEmbeddings to "Chroma" directory
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

save_directory = "Chroma"

store = Chroma.from_documents(paragraphs, embeddings, collection_name='hp4', persist_directory=save_directory)
store.persist()

# search for similar paragraphs
search = store.similarity_search_with_score('Does the patient smoke?')

print(search)

---

# save HuggingFaceInstructEmbeddings to "Chroma2" directory
from langchain.embeddings import HuggingFaceInstructEmbeddings

embeddings2 = HuggingFaceInstructEmbeddings()

save_directory2 = "Chroma2"

store2 = Chroma.from_documents(paragraphs, embeddings2, collection_name='hp4', persist_directory=save_directory2)
store2.persist()

# search for similar paragraphs
search2 = store2.similarity_search_with_score('Does the patient smoke?')

print(search2)
app2a.py
TEXT
import streamlit as st

# from langchain.llms import OpenAI
from langchain.llms import LlamaCpp

# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings

from langchain.vectorstores import Chroma

from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo
)

# embeddings = OpenAIEmbeddings()
embeddings = HuggingFaceInstructEmbeddings()

# llm = OpenAI(temperature=0.9, verbose=True)

# https://python.langchain.com/docs/integrations/llms/llamacpp
# https://github.com/langchain-ai/langchain/issues/8004
# https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md

llm = LlamaCpp(
    model_path="llama-2-7b-chat.ggmlv3.q4_0.bin",
    verbose=True,
    temperature=1,
    max_tokens=2048,  # 256
    n_gpu_layers=1,
    n_batch=512,
    f16_kv=True,  # MUST be set to True, otherwise you will run into problems after a couple of calls
    n_ctx=10240   # context length
)

load_directory = "Chroma2"

# load embeddings from the "Chroma2" directory
db = Chroma(persist_directory=load_directory, collection_name='hp4', embedding_function=embeddings)

vectorstore_info = VectorStoreInfo(
    name="hp4",
    description="embeddings generated from the pdf document",
    vectorstore=db
)

toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)

agent_executor = create_vectorstore_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True
)

prompt = st.text_input('Input your prompt')

if prompt:
    response = agent_executor.run(prompt)
    st.write(response)

    with st.expander('Document Similarity Search'):
        search = db.similarity_search_with_score(prompt)
        st.write(search)
app3a.py
TEXT
#!pip install streamlit
#!pip install llama-cpp-python
#!pip install watchdog

from dotenv import load_dotenv, find_dotenv
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import streamlit as st

from typing import Union
from typing import List


def init_page() -> None:
    st.set_page_config(page_title="Pirate Jack")
    st.header("Pirate Jack")
    st.sidebar.title("Options")


def init_messages() -> None:
    clear_button = st.sidebar.button("Clear Conversation", key="clear")
    if clear_button or "messages" not in st.session_state:
        st.session_state.messages = [
            SystemMessage(
                # content="You are a helpful AI assistant. Reply your answer in markdown format.")
                content="You are now Pirate Jack. Always talk like a pirate. Let's think step by step."
            )
        ]
        st.session_state.costs = []


def select_llm() -> Union[ChatOpenAI, LlamaCpp]:
    model_name = st.sidebar.radio(
        "Choose LLM:",
        (
            "llama-2-7b-chat.ggmlv3.q2_K",
            "codeup-llama-2-13b-chat-hf.ggmlv3.q2_K",
            "stablebeluga-13b.ggmlv3.q4_K_S",
            "gpt-3.5-turbo-0613",
            "gpt-4",
        ),
    )

    temperature = st.sidebar.slider(
        "Temperature:", min_value=0.0, max_value=1.0, value=0.0, step=0.01
    )
    if model_name.startswith("gpt-"):
        return ChatOpenAI(temperature=temperature, model_name=model_name)
    elif (
        model_name.startswith("llama-2-")
        or model_name.startswith("codeup-llama-2-")
        or model_name.startswith("stablebeluga-")
    ):
        callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

        if model_name.startswith("llama-2-"):
            return LlamaCpp(
                model_path=f"./{model_name}.bin",
                callback_manager=callback_manager,
                verbose=True,  # False
                temperature=temperature,
                max_tokens=2048,  # 256
                n_ctx=1024,  # context length
                n_gpu_layers=1,  # Metal: 1 is enough
                n_batch=4,  # should be between 1 and n_ctx; consider the amount of RAM of your Apple Silicon chip
                f16_kv=True,  # MUST be set to True, otherwise you will run into problems after a couple of calls
            )
        else:
            return LlamaCpp(
                model_path=f"./{model_name}.bin",
                callback_manager=callback_manager,
                verbose=True,  # False
                temperature=temperature,
                max_tokens=2048,  # 256
            )


def get_answer(llm, messages) -> tuple[str, float]:
    if isinstance(llm, ChatOpenAI):
        with get_openai_callback() as cb:
            answer = llm(messages)
        return answer.content, cb.total_cost
    if isinstance(llm, LlamaCpp):
        answer = llm(llama_v2_prompt(convert_langchainschema_to_dict(messages)))
        return (answer, 0.0)


def find_role(message: Union[SystemMessage, HumanMessage, AIMessage]) -> str:
    """
    Identify role name from langchain.schema object.
    """
    if isinstance(message, SystemMessage):
        return "system"
    if isinstance(message, HumanMessage):
        return "user"
    if isinstance(message, AIMessage):
        return "assistant"
    raise TypeError("Unknown message type.")


def convert_langchainschema_to_dict(
    messages: List[Union[SystemMessage, HumanMessage, AIMessage]]
) -> List[dict]:
    """
    Convert the chain of chat messages in list of langchain.schema format to
    list of dictionary format.
    """
    return [
        {"role": find_role(message), "content": message.content} for message in messages
    ]


def llama_v2_prompt(messages: List[dict]) -> str:
    """
    Convert the messages in list of dictionary format to Llama 2 compliant format.
    """
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    BOS, EOS = "<s>", "</s>"
    DEFAULT_SYSTEM_PROMPT = f"""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

    if messages[0]["role"] != "system":
        messages = [
            {
                "role": "system",
                "content": DEFAULT_SYSTEM_PROMPT,
            }
        ] + messages
    messages = [
        {
            "role": messages[1]["role"],
            "content": B_SYS + messages[0]["content"] + E_SYS + messages[1]["content"],
        }
    ] + messages[2:]

    messages_list = [
        f"{BOS}{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} {EOS}"
        for prompt, answer in zip(messages[::2], messages[1::2])
    ]
    messages_list.append(
        f"{BOS}{B_INST} {(messages[-1]['content']).strip()} {E_INST} {EOS}"
    )

    result = "".join(messages_list)
    print(result)

    return result


def main() -> None:
    _ = load_dotenv(find_dotenv())

    init_page()
    llm = select_llm()
    init_messages()

    # Supervise user input
    if user_input := st.chat_input("Input your question!"):
        st.session_state.messages.append(HumanMessage(content=user_input))
        with st.spinner("Pirate Jack be thinking ..."):
            result = get_answer(llm, st.session_state.messages)
        if result is None:
            st.session_state.messages.append(
                AIMessage(content="Sorry, I don't know the answer.")
            )
        else:
            answer, cost = result
            st.session_state.messages.append(AIMessage(content=answer))
            st.session_state.costs.append(cost)

    # Display chat history
    messages = st.session_state.get("messages", [])
    for message in messages:
        if isinstance(message, AIMessage):
            with st.chat_message("assistant"):
                st.markdown(message.content)
        elif isinstance(message, HumanMessage):
            with st.chat_message("user"):
                st.markdown(message.content)

    costs = st.session_state.get("costs", [])
    st.sidebar.markdown("## Costs")
    st.sidebar.markdown(f"**Total cost: ${sum(costs):.5f}**")
    for cost in costs:
        st.sidebar.markdown(f"- ${cost:.5f}")


# streamlit run app.py
if __name__ == "__main__":
    main()