# 09_pinecone.py — scrape chapters of Nietzsche's "Der Wanderer und sein
# Schatten", then index them in Pinecone via LlamaIndex for Q&A.
from dotenv import load_dotenv
load_dotenv()
import os
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from pinecone import Pinecone, ServerlessSpec
from llama_index.core import (
SimpleDirectoryReader,
VectorStoreIndex,
Settings
)
# PineconeReader only used in commented-out code, requires llama-index-readers-pinecone package
# from llama_index.readers.pinecone import PineconeReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.storage.storage_context import StorageContext
from llama_index.llms.openai import OpenAI # Use llama_index's native OpenAI, not LangChain's
# reader = PineconeReader(
# api_key=os.getenv("PINECONE_API_KEY"),
# environment="us-west4-gcp"
# )
# docs_from_pinecone = reader.load_data(index_name="nietzsche")
# Chapters of Nietzsche's "Der Wanderer und sein Schatten" hosted on
# Projekt Gutenberg-DE; each page is scraped and indexed separately.
urls = [
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wanderer.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande002.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande003.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande004.html",
]
def scrape_book(urls):
    """Download each page in *urls* and save its visible text under book/.

    For every URL: fetch the HTML, keep only the h1-h3 headings and p
    tags, repair the site's mojibake (latin-1 bytes that are really
    UTF-8), and write the result to book/<pagename>.txt.

    Raises requests.HTTPError on a non-2xx response so an error page is
    never silently indexed as book text.
    """
    book_path = Path("book")
    # hoisted out of the loop: one directory serves every page
    book_path.mkdir(exist_ok=True)
    for url in urls:
        # timeout so a dead host cannot hang the scrape forever
        req = requests.get(url, timeout=30)
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")
        # keep only the heading tags up to h3, and p tags
        tags = soup.find_all(["h1", "h2", "h3", "p"])
        result = []
        for tag in tags:
            try:
                # Projekt Gutenberg-DE serves latin-1-labelled pages whose
                # payload is actually UTF-8; re-decode to fix umlauts etc.
                result.append(tag.text.encode("latin-1").decode("utf-8"))
            except (UnicodeEncodeError, UnicodeDecodeError):
                # drop only the snippets that genuinely cannot be re-decoded
                pass
        pagename = url.split("/")[-1]
        # explicit encoding so the output file is portable across platforms
        (book_path / f"{pagename}.txt").write_text(
            "\n".join(result), encoding="utf-8"
        )
def create_pages(urls):
    """Return the final path component (the page file name) of every URL."""
    return [url.rsplit("/", 1)[-1] for url in urls]
def build_docs(pages):
    """Load each scraped page's text file into LlamaIndex documents.

    Returns a dict mapping page name -> documents read from
    book/<page>.txt via SimpleDirectoryReader.
    """
    return {
        page: SimpleDirectoryReader(input_files=[f"book/{page}.txt"]).load_data()
        for page in pages
    }
def build_context(model_name):
    """Install the global LLM used by LlamaIndex (the 0.10+ Settings API).

    Replaces the pre-0.10 ServiceContext pattern; returns None because
    the configuration now lives on the module-global Settings object.
    """
    llm = OpenAI(temperature=0, model=model_name)
    Settings.llm = llm
    return None
def build_index(pages, docs):
    """Build one Pinecone-backed vector index per page.

    pages: page names (keys into *docs*), as from create_pages().
    docs:  page name -> documents, as from build_docs().

    Returns a dict mapping page name -> VectorStoreIndex persisted in
    the existing "nietzsche" Pinecone index.
    """
    client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    # create a Pinecone index if you don't have one
    # https://openai.com/blog/new-and-improved-embedding-model (12288 -> 1536 dimensions)
    # Free tier supports us-east-1 region
    # client.create_index(name="nietzsche", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1"))
    remote_index = client.Index("nietzsche")
    # remote_index.upsert("nietzsche_wandere", [1,2,3])
    # remote_index.describe_index_stats()
    # remote_index.delete()
    build_context("gpt-3.5-turbo")  # install the global Settings.llm once
    page_indices = {}
    for page in pages:
        # NOTE(review): metadata_filters is forwarded to PineconeVectorStore;
        # confirm the installed llama-index version accepts this keyword.
        store = PineconeVectorStore(
            pinecone_index=remote_index,
            metadata_filters={"page": page},
        )
        ctx = StorageContext.from_defaults(vector_store=store)
        page_indices[page] = VectorStoreIndex.from_documents(
            docs[page], storage_context=ctx
        )
    # index_struct.index_id is deprecated in llama_index 0.10+
    print("Indexing complete.")
    return page_indices
if __name__ == "__main__":
    # Uncomment to download the books from Project Gutenberg first.
    # scrape_book(urls)
    # Assumes the pages have already been downloaded into ./book.
    pages = create_pages(urls)
    docs = build_docs(pages)
    indices = build_index(pages, docs)

    # Custom QA prompt: answer in the original German and add an English
    # translation/explanation.
    PROMPT_TEMPLATE = (
        "Here are the context information:"
        "\n-----------------------------\n"
        "{context_str}"
        "\n-----------------------------\n"
        "Answer the following question in the original German text, and provide an english translation and explanation in as instructive and educational way as possible: {query_str} \n"
    )
    # QuestionAnswerPrompt was removed in llama_index 0.10+; use PromptTemplate.
    from llama_index.core import PromptTemplate

    QA_PROMPT = PromptTemplate(PROMPT_TEMPLATE)
    engine = indices["wande002.html"].as_query_engine(text_qa_template=QA_PROMPT)
    answer = engine.query("What are important things according to Nietzsche?")
    print(str(answer))
    print(answer.get_formatted_sources())