# 09_pinecone.py — scrape chapters of Nietzsche's "Der Wanderer und sein
# Schatten", then index them in Pinecone via LlamaIndex for Q&A.
from dotenv import load_dotenv
load_dotenv()
import os
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from pinecone import Pinecone, ServerlessSpec
from llama_index.core import (
SimpleDirectoryReader,
VectorStoreIndex,
Settings
)
# PineconeReader only used in commented-out code, requires llama-index-readers-pinecone package
# from llama_index.readers.pinecone import PineconeReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.storage.storage_context import StorageContext
from llama_index.llms.openai import OpenAI # Use llama_index's native OpenAI, not LangChain's
# reader = PineconeReader(
# api_key=os.getenv("PINECONE_API_KEY"),
# environment="us-west4-gcp"
# )
# docs_from_pinecone = reader.load_data(index_name="nietzsche")
# Chapters of Nietzsche's "Der Wanderer und sein Schatten" hosted on
# Projekt Gutenberg-DE; each page is scraped and indexed separately.
urls = [
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wanderer.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande002.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande003.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande004.html",
]
def scrape_book(urls):
    """Download each page in *urls* and save its visible text under book/.

    For every URL: fetch the HTML, keep only the h1-h3 headings and p
    tags, repair the site's mojibake (latin-1 bytes that are really
    UTF-8), and write the result to book/<pagename>.txt.

    Raises requests.HTTPError on a non-2xx response so an error page is
    never silently indexed as book text.
    """
    book_path = Path("book")
    # hoisted out of the loop: one directory serves every page
    book_path.mkdir(exist_ok=True)
    for url in urls:
        # timeout so a dead host cannot hang the scrape forever
        req = requests.get(url, timeout=30)
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")
        # keep only the heading tags up to h3, and p tags
        tags = soup.find_all(["h1", "h2", "h3", "p"])
        result = []
        for tag in tags:
            try:
                # Projekt Gutenberg-DE serves latin-1-labelled pages whose
                # payload is actually UTF-8; re-decode to fix umlauts etc.
                result.append(tag.text.encode("latin-1").decode("utf-8"))
            except (UnicodeEncodeError, UnicodeDecodeError):
                # drop only the snippets that genuinely cannot be re-decoded
                pass
        pagename = url.split("/")[-1]
        # explicit encoding so the output file is portable across platforms
        (book_path / f"{pagename}.txt").write_text(
            "\n".join(result), encoding="utf-8"
        )
def create_pages(urls):
    """Return the final path component (the page file name) of every URL."""
    return [url.rsplit("/", 1)[-1] for url in urls]
def build_docs(pages):
    """Load each scraped page's text file into LlamaIndex documents.

    Returns a dict mapping page name -> documents read from
    book/<page>.txt via SimpleDirectoryReader.
    """
    return {
        page: SimpleDirectoryReader(input_files=[f"book/{page}.txt"]).load_data()
        for page in pages
    }
def build_context(model_name):
    """Install the global LLM used by LlamaIndex (the 0.10+ Settings API).

    Replaces the pre-0.10 ServiceContext pattern; returns None because
    the configuration now lives on the module-global Settings object.
    """
    llm = OpenAI(temperature=0, model=model_name)
    Settings.llm = llm
    return None
def build_index(pages, docs):
    """Build one Pinecone-backed vector index per page.

    pages: page names (keys into *docs*), as from create_pages().
    docs:  page name -> documents, as from build_docs().

    Returns a dict mapping page name -> VectorStoreIndex persisted in
    the existing "nietzsche" Pinecone index.
    """
    client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    # create a Pinecone index if you don't have one
    # https://openai.com/blog/new-and-improved-embedding-model (12288 -> 1536 dimensions)
    # Free tier supports us-east-1 region
    # client.create_index(name="nietzsche", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1"))
    remote_index = client.Index("nietzsche")
    # remote_index.upsert("nietzsche_wandere", [1,2,3])
    # remote_index.describe_index_stats()
    # remote_index.delete()
    build_context("gpt-3.5-turbo")  # install the global Settings.llm once
    page_indices = {}
    for page in pages:
        # NOTE(review): metadata_filters is forwarded to PineconeVectorStore;
        # confirm the installed llama-index version accepts this keyword.
        store = PineconeVectorStore(
            pinecone_index=remote_index,
            metadata_filters={"page": page},
        )
        ctx = StorageContext.from_defaults(vector_store=store)
        page_indices[page] = VectorStoreIndex.from_documents(
            docs[page], storage_context=ctx
        )
    # index_struct.index_id is deprecated in llama_index 0.10+
    print("Indexing complete.")
    return page_indices
if __name__ == "__main__":
    # Uncomment to download the books from Project Gutenberg first.
    # scrape_book(urls)
    # Assumes the pages have already been downloaded into ./book.
    pages = create_pages(urls)
    docs = build_docs(pages)
    indices = build_index(pages, docs)

    # Custom QA prompt: answer in the original German and add an English
    # translation/explanation.
    PROMPT_TEMPLATE = (
        "Here are the context information:"
        "\n-----------------------------\n"
        "{context_str}"
        "\n-----------------------------\n"
        "Answer the following question in the original German text, and provide an english translation and explanation in as instructive and educational way as possible: {query_str} \n"
    )
    # QuestionAnswerPrompt was removed in llama_index 0.10+; use PromptTemplate.
    from llama_index.core import PromptTemplate

    QA_PROMPT = PromptTemplate(PROMPT_TEMPLATE)
    engine = indices["wande002.html"].as_query_engine(text_qa_template=QA_PROMPT)
    answer = engine.query("What are important things according to Nietzsche?")
    print(str(answer))
    print(answer.get_formatted_sources())