intel-oneAPI/Backend/Final_NotesChunker.py at cf46a794b062f33423097e6cae858b405304ecdb · hack2skill/intel-oneAPI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import time

import boto3
import openai
from fastapi import APIRouter

# Credentials are taken from the environment so that real secrets never have
# to be written into this file.  Unset variables yield "" (the previous
# placeholder value), keeping the names importable as plain strings.
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "")
s3_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
s3_bucket_name = "learnmateai"

# Only hand explicit keys to boto3 when both are present; otherwise pass
# nothing so boto3 resolves credentials through its default chain instead of
# signing requests with blank keys.
_s3_credentials = {}
if s3_access_key and s3_secret_access_key:
    _s3_credentials = {
        "aws_access_key_id": s3_access_key,
        "aws_secret_access_key": s3_secret_access_key,
    }

# Shared S3 client used by the functions in this module.
s3 = boto3.client("s3", **_s3_credentials)

# Set up OpenAI API credentials
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def batch_text(input_text, delimiter="TOPIC:"):
    """Split *input_text* on *delimiter* and return the non-blank pieces.

    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped.  Text that precedes the first delimiter is
    returned as a piece of its own.
    """
    pieces = []
    for fragment in input_text.split(delimiter):
        trimmed = fragment.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

def upload_to_s3(bucket_name, folder_name, file_name, content, client=None):
    """Store *content* in S3 under ``<folder_name>/<file_name>``.

    Args:
        bucket_name: Name of the destination bucket.
        folder_name: Key prefix, without a trailing slash.
        file_name: Last segment of the object key.
        content: Object body, passed to ``put_object`` as ``Body``.
        client: Optional object exposing ``put_object``.  When omitted, the
            module-level ``s3`` client is used, so uploads share the same
            credentials as the reads in this module and no new client is
            built for every uploaded file.
    """
    if client is None:
        client = s3
    key = f"{folder_name}/{file_name}"
    client.put_object(Body=content, Bucket=bucket_name, Key=key)

# Router on which this module's endpoints are registered (see the
# ``@app.get`` decorator below).
app = APIRouter()

@app.get("/process_files")
def process_files(user: str):
    """Turn every note file of *user* into per-topic summary files on S3.

    Each object under ``<user>/notes_txt/`` is read, cut into fixed-size
    character chunks, and every chunk is sent to the OpenAI chat API with a
    prompt asking for ``TOPIC:notes`` output.  Each returned topic is stored
    as ``<user>/Analysed_Notes/<note name>/<topic title>.txt``.

    Args:
        user: Key prefix (without trailing slash) that owns the notes.

    Returns:
        ``{"message": "NOTES"}`` once all files have been processed; this is
        also returned when the user has no note files at all.
    """
    user_prefix = user + "/"
    notes_prefix = user_prefix + "notes_txt/"
    batch_size = 3000  # characters sent per OpenAI request
    rate_limit_delay = 20  # seconds to wait after each OpenAI request

    def process_file(file_name):
        """Summarise one note file and upload one object per topic."""
        # Read file from S3
        response = s3.get_object(Bucket=s3_bucket_name, Key=notes_prefix + file_name)
        file_content = response['Body'].read().decode('utf-8')

        # Output folder is named after the note file up to its first dot.
        note_name = file_name.split(".")[0]
        folder_name = f'{user_prefix}Analysed_Notes/{note_name}'

        for start in range(0, len(file_content), batch_size):
            chunk = file_content[start:start + batch_size]

            # Send chunk to OpenAI API
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "user",
                        "content": f"divide the text topic wise (it should look like TOPIC:notes) notes should very breif and be created in a way so that you will be able to recreate the full txt :\n\n{chunk}\n\n"
                    }
                ]
            )
            important_topics = completion.choices[0].message.content

            # Add a delay to handle the rate limit
            time.sleep(rate_limit_delay)

            for topic in batch_text(important_topics):
                # First line is the topic title, the remainder is its notes.
                title, _, notes = topic.partition('\n')
                # "/" is replaced as well: it would otherwise split the
                # title into nested S3 "folders" instead of one file name.
                topic_file = title.strip().replace(" ", "_").replace("/", "_") + '.txt'
                upload_to_s3(s3_bucket_name, folder_name, topic_file, notes.strip())

                # Print uploaded file information
                print(f"File '{topic_file}' uploaded to '{s3_bucket_name}/{folder_name}'")

    # A single list_objects_v2 call returns at most 1000 keys and omits
    # "Contents" entirely when nothing matches, so page through the listing
    # and treat a missing "Contents" as an empty page.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=s3_bucket_name, Prefix=notes_prefix):
        for entry in page.get('Contents', []):
            file_name = entry['Key'].split('/')[-1]
            if not file_name:
                # Key ends with "/" (folder placeholder object): nothing to read.
                continue
            process_file(file_name)

    return {"message": "NOTES"}