diff --git a/rag_and_distilled_model/Apollo11_rag&distilled.ipynb b/rag_and_distilled_model/Apollo11_rag&distilled.ipynb index 4da0f4e..6d21bcc 100644 --- a/rag_and_distilled_model/Apollo11_rag&distilled.ipynb +++ b/rag_and_distilled_model/Apollo11_rag&distilled.ipynb @@ -20,7 +20,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install somepackage -qq langchain langchain-community langchain-core langchain-text-splitters langchain-huggingface sentence-transformers chromadb transformers torch accelerate unstructured" + "%pip install somepackage -qq langchain langchain-community langchain-core langchain-text-splitters langchain-huggingface sentence-transformers chromadb transformers torch accelerate unstructured codecarbon" ] }, { @@ -28,11 +28,8 @@ "id": "ed814cfe", "metadata": {}, "source": [ - "# 2. Import libraries and set configuration\n", - "\n", - "Here we import the necessary modules and define paths, constants,\n", - "and model settings.\n", - "We also suppress warnings to keep the notebook output clean." + "# 2. 
Imports and Configuration\n", + "Imports necessary libraries, sets constants for models, embeddings, chunking, and loads prompt templates from a JSON file for various NLP tasks.\n" ] }, { @@ -44,6 +41,7 @@ "source": [ "from pathlib import Path\n", "import json\n", + "import time\n", "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "from langchain_community.vectorstores import Chroma\n", "from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline\n", @@ -51,6 +49,7 @@ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline\n", "import torch\n", "import warnings\n", + "from codecarbon import OfflineEmissionsTracker\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", @@ -62,18 +61,39 @@ "TOP_K_RESULTS = 5\n", "RELEVANCE_THRESHOLD = 0.3\n", "LLM_MODEL = \"MBZUAI/LaMini-Flan-T5-248M\"\n", - "MAX_NEW_TOKENS = 100\n", - "LLM_TEMPERATURE = 0.2\n", + "MAX_NEW_TOKENS = 200\n", "USE_GPU = torch.cuda.is_available()\n", + "COUNTRY_ISO_CODE = \"EGY\"\n", + "ENABLE_RECURSIVE_EDITING = True\n", + "MAX_EDIT_ITERATIONS = 2\n", + "\n", + "with open(PROMPTS_FILE, \"r\") as f:\n", + " config_data = json.load(f)\n", + " MASTER_INSTRUCTION = config_data[\"metadata\"][\"master_instruction\"]\n", + " TASK_INSTRUCTIONS = config_data[\"metadata\"][\"task_instructions\"]\n", "\n", - "PROMPT_TEMPLATE = \"\"\"Answer the question about Apollo 11 based on the context below. 
If you cannot answer based on the context, say \"I don't have enough information to answer that.\"\n", + "\n", + "def build_prompt_template(task_type):\n", + " task_instruction = TASK_INSTRUCTIONS[task_type]\n", + " return f\"\"\"{MASTER_INSTRUCTION}\n", + "\n", + "{task_instruction}\n", "\n", "Context:\n", - "{context}\n", + "{{context}}\n", + "\n", + "Question: {{question}}\n", "\n", - "Question: {question}\n", + "Answer:\"\"\"\n", "\n", - "Answer:\"\"\"" + "\n", + "PROMPT_TEMPLATES = {\n", + " \"summarization\": build_prompt_template(\"summarization\"),\n", + " \"reasoning\": build_prompt_template(\"reasoning\"),\n", + " \"rag\": build_prompt_template(\"rag\"),\n", + " \"paraphrasing\": build_prompt_template(\"paraphrasing\"),\n", + " \"creative_generation\": build_prompt_template(\"creative_generation\"),\n", + "}" ] }, { @@ -107,8 +127,7 @@ "source": [ "# 4. Load the local language model\n", "\n", - "We initialize a small, local LLM (LaMini-Flan-T5) that can run on CPU or GPU.\n", - "This model will later generate answers based on retrieved context." 
+ "Initializes the HuggingFace Seq2Seq model and tokenizer, wraps it in a pipeline for text generation, and tracks energy usage with `OfflineEmissionsTracker`.\n" ] }, { @@ -118,30 +137,26 @@ "metadata": {}, "outputs": [], "source": [ - "def initialize_local_llm():\n", - " device = 0 if USE_GPU else -1\n", - " tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)\n", - " model = AutoModelForSeq2SeqLM.from_pretrained(\n", - " LLM_MODEL,\n", - " torch_dtype=torch.float16 if USE_GPU else torch.float32,\n", - " device_map=\"auto\" if USE_GPU else None,\n", - " low_cpu_mem_usage=True,\n", - " )\n", - " pipe = pipeline(\n", - " \"text2text-generation\",\n", - " model=model,\n", - " tokenizer=tokenizer,\n", - " max_new_tokens=MAX_NEW_TOKENS,\n", - " temperature=LLM_TEMPERATURE,\n", - " repetition_penalty=1.2,\n", - " do_sample=False,\n", - " top_p=0.95,\n", - " device=device,\n", - " )\n", - " return HuggingFacePipeline(pipeline=pipe)\n", - "\n", - "\n", - "llm = initialize_local_llm()" + "tracker_loading = OfflineEmissionsTracker(\n", + " country_iso_code=COUNTRY_ISO_CODE, project_name=\"model_loading\", log_level=\"error\"\n", + ")\n", + "tracker_loading.start()\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)\n", + "model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL, low_cpu_mem_usage=True)\n", + "pipe = pipeline(\n", + " \"text2text-generation\",\n", + " model=model,\n", + " tokenizer=tokenizer,\n", + " max_new_tokens=MAX_NEW_TOKENS,\n", + " do_sample=False,\n", + " repetition_penalty=1.3,\n", + " device=0 if USE_GPU else -1,\n", + " truncation=True,\n", + " max_length=512,\n", + ")\n", + "llm = HuggingFacePipeline(pipeline=pipe)\n", + "emissions_loading = tracker_loading.stop()" ] }, { @@ -149,7 +164,7 @@ "id": "59668d86", "metadata": {}, "source": [ - "# Load documents from JSON\n", + "# 5. Load documents from JSON\n", "\n", "We read the context and metadata directly from a JSON file.\n", "We also clean metadata and split text into chunks." 
@@ -263,7 +278,13 @@ "metadata": {}, "outputs": [], "source": [ - "db = build_chroma_store(documents)" + "tracker_embeddings = OfflineEmissionsTracker(country_iso_code=COUNTRY_ISO_CODE)\n", + "tracker_embeddings.start()\n", + "\n", + "db = build_chroma_store(documents)\n", + "\n", + "emissions_embeddings = tracker_embeddings.stop()\n", + "print(f\"Embeddings creation emissions: {emissions_embeddings:.6f} kg CO2\")" ] }, { @@ -271,10 +292,15 @@ "id": "670d000f", "metadata": {}, "source": [ - "# 9. Define query and response generation\n", + "# 9. RAG Response Generation and Answer Refinement\n", + "\n", + "Defines functions to generate answers using Retrieval-Augmented Generation (RAG), detect issues in responses, and refine them:\n", "\n", - "These functions retrieve the most relevant text chunks and use the\n", - "LLM to answer a question." + "- `detect_answer_issues()`: Checks if an answer is incomplete, repetitive, cut off, or disclaimer-only. \n", + "- `retry_with_better_retrieval()`: Performs additional retrieval when issues are detected. \n", + "- `refine_failed_answer()`: Re-generates answers based on improved context and task-specific instructions. \n", + "- `generate_rag_response()`: Combines retrieval, LLM generation, and recursive refinement to produce a final answer. 
\n", + "- `ask()`: Simple wrapper to query the system and print the result.\n" ] }, { @@ -284,59 +310,254 @@ "metadata": {}, "outputs": [], "source": [ - "def query_database(query_text, k=TOP_K_RESULTS, threshold=RELEVANCE_THRESHOLD):\n", + "def detect_answer_issues(\n", + " answer, question, tokens_generated=None, max_tokens=MAX_NEW_TOKENS\n", + "):\n", + " answer_lower = answer.lower().strip()\n", + "\n", + " if (\n", + " \"no relevant information\" in answer_lower\n", + " or \"does not provide information\" in answer_lower\n", + " ):\n", + " return True, \"no_info\"\n", + "\n", + " sentences = [s.strip() for s in answer.split(\".\") if len(s.strip()) > 10]\n", + " if len(sentences) >= 2:\n", + " sentence_counts = {}\n", + " for sent in sentences:\n", + " sent_normalized = sent.lower().strip()\n", + " if sent_normalized:\n", + " sentence_counts[sent_normalized] = (\n", + " sentence_counts.get(sent_normalized, 0) + 1\n", + " )\n", + "\n", + " if any(count > 1 for count in sentence_counts.values()):\n", + " return True, \"repetitive\"\n", + "\n", + " if tokens_generated and tokens_generated >= max_tokens - 5:\n", + " return True, \"token_limit\"\n", + "\n", + " if answer and len(answer) > 20:\n", + " last_char = answer.strip()[-1]\n", + " if last_char not in '.!?\":)]}':\n", + " return True, \"incomplete\"\n", + "\n", + " if len(sentences) > 1:\n", + " last_sentence = sentences[-1].strip()\n", + " if len(last_sentence) > 0 and len(last_sentence) < 20:\n", + " if last_sentence and last_sentence[0].islower():\n", + " return True, \"cutoff\"\n", + "\n", + " disclaimer_phrases = [\n", + " \"i'm sorry\",\n", + " \"i cannot\",\n", + " \"i don't have\",\n", + " \"not possible to determine\",\n", + " \"context does not\",\n", + " ]\n", + "\n", + " if len(answer) < 150 and any(\n", + " phrase in answer_lower for phrase in disclaimer_phrases\n", + " ):\n", + " substantial_sentences = [\n", + " s\n", + " for s in sentences\n", + " if len(s.strip()) > 30\n", + " and not 
any(phrase in s.lower() for phrase in disclaimer_phrases)\n", + " ]\n", + " if len(substantial_sentences) == 0:\n", + " return True, \"disclaimer_only\"\n", + "\n", + " return False, None\n", + "\n", + "\n", + "def retry_with_better_retrieval(query_text, task_type, original_context, issue_type):\n", + " if issue_type == \"no_info\":\n", + " k = 8\n", + " threshold = 0.15\n", + " elif issue_type in [\"incomplete\", \"cutoff\", \"token_limit\", \"repetitive\"]:\n", + " k = 3\n", + " threshold = RELEVANCE_THRESHOLD\n", + " elif issue_type == \"disclaimer_only\":\n", + " k = 7\n", + " threshold = 0.2\n", + " else:\n", + " k = TOP_K_RESULTS\n", + " threshold = RELEVANCE_THRESHOLD\n", + "\n", " results = db.similarity_search_with_relevance_scores(query_text, k=k)\n", "\n", " if len(results) == 0 or results[0][1] < threshold:\n", - " return []\n", + " return None, None\n", + "\n", + " if issue_type in [\"incomplete\", \"cutoff\", \"token_limit\", \"repetitive\"]:\n", + " context_parts = [doc.page_content for doc, _ in results[:3]]\n", + " context_text = \"\\n\\n\".join(context_parts)[:600]\n", + " else:\n", + " context_parts = [doc.page_content for doc, _ in results]\n", + " context_text = \"\\n\\n\".join(context_parts)[:1200]\n", + "\n", + " return context_text, [score for _, score in results]\n", + "\n", + "\n", + "def refine_failed_answer(original_answer, context, query_text, issue_type):\n", + " if issue_type == \"no_info\":\n", + " refine_prompt = f\"\"\"Context: {context[:700]}\n", "\n", - " return results\n", + "Question: {query_text}\n", + "\n", + "Answer the question using only the information from the context above.\n", + "\n", + "Answer:\"\"\"\n", + "\n", + " elif issue_type == \"repetitive\":\n", + " refine_prompt = f\"\"\"Context: {context[:500]}\n", + "\n", + "Question: {query_text}\n", + "\n", + "Provide a concise answer without repeating information. 
2-3 distinct sentences:\n", + "\n", + "Answer:\"\"\"\n", + "\n", + " elif issue_type in [\"incomplete\", \"cutoff\", \"token_limit\"]:\n", + " refine_prompt = f\"\"\"Context: {context[:500]}\n", + "\n", + "Question: {query_text}\n", + "\n", + "Provide a concise, complete answer in 2-3 sentences:\n", + "\n", + "Answer:\"\"\"\n", + "\n", + " elif issue_type == \"disclaimer_only\":\n", + " refine_prompt = f\"\"\"Context: {context[:700]}\n", + "\n", + "Question: {query_text}\n", + "\n", + "Answer the question directly using information from the context. Be specific and factual.\n", + "\n", + "Answer:\"\"\"\n", + "\n", + " else:\n", + " return original_answer\n", + "\n", + " tokens = tokenizer.encode(refine_prompt, truncation=False)\n", + " if len(tokens) > 450:\n", + " refine_prompt = tokenizer.decode(tokens[:450], skip_special_tokens=True)\n", + " refine_prompt += f\"\\n\\nQuestion: {query_text}\\n\\nAnswer:\"\n", + "\n", + " refined = llm.invoke(refine_prompt)\n", + " return refined.strip()\n", "\n", "\n", "def generate_rag_response(\n", - " query_text, k=TOP_K_RESULTS, threshold=RELEVANCE_THRESHOLD, verbose=False\n", + " query_text, task_type, k=TOP_K_RESULTS, threshold=RELEVANCE_THRESHOLD\n", "):\n", " results = db.similarity_search_with_relevance_scores(query_text, k=k)\n", "\n", - " if len(results) == 0 or results[0][1] < threshold:\n", - " return {\n", - " \"answer\": \"No relevant information found.\",\n", - " \"sources\": [],\n", - " \"context\": \"\",\n", - " \"prompt\": \"\",\n", - " }\n", + " if task_type == \"creative_generation\":\n", + " threshold = 0.1\n", "\n", - " context_text = \"\\n\\n---\\n\\n\".join([doc.page_content for doc, _score in results])\n", - " prompt_template = PromptTemplate.from_template(PROMPT_TEMPLATE)\n", + " if len(results) == 0 or results[0][1] < threshold:\n", + " if task_type == \"creative_generation\" and len(results) > 0:\n", + " context_text = results[0][0].page_content\n", + " else:\n", + " return {\n", + " \"answer\": \"No 
relevant information found.\",\n", + " \"sources\": [],\n", + " \"task_type\": task_type,\n", + " \"scores\": [],\n", + " \"iterations\": 0,\n", + " \"issue_detected\": \"no_info\",\n", + " \"fixed\": False,\n", + " \"should_retry\": True,\n", + " }\n", + " else:\n", + " context_text = \"\\n\\n\".join([doc.page_content for doc, _score in results])\n", + "\n", + " prompt_template = PromptTemplate.from_template(PROMPT_TEMPLATES[task_type])\n", " prompt = prompt_template.format(context=context_text, question=query_text)\n", "\n", - " if llm is None:\n", - " return {\n", - " \"answer\": \"LLM not initialized.\",\n", - " \"sources\": [],\n", - " \"context\": context_text,\n", - " \"prompt\": prompt,\n", - " }\n", + " tokens = tokenizer.encode(prompt, truncation=False)\n", + " token_limit_exceeded = len(tokens) > 400\n", + "\n", + " if token_limit_exceeded:\n", + " truncated_tokens = tokens[:450]\n", + " truncated_prompt = tokenizer.decode(truncated_tokens, skip_special_tokens=True)\n", + " prompt = truncated_prompt + f\"\\n\\nQuestion: {query_text}\\n\\nAnswer:\"\n", + " if \"Context:\" in truncated_prompt:\n", + " try:\n", + " context_text = (\n", + " truncated_prompt.split(\"Context:\")[1].split(\"Question:\")[0].strip()\n", + " )\n", + " except Exception as e:\n", + " print(f\"Error extracting context: {e}\")\n", + " pass\n", + "\n", + " answer = llm.invoke(prompt)\n", + "\n", + " answer_tokens = len(tokenizer.encode(answer, truncation=False))\n", "\n", - " response_text = llm.invoke(prompt)\n", " sources = [doc.metadata.get(\"source\", \"Unknown\") for doc, _score in results]\n", "\n", - " if verbose:\n", - " print(f\"\\nQuery: {query_text}\")\n", - " print(f\"\\nAnswer: {response_text}\")\n", - " print(f\"\\nSources: {', '.join([Path(s).name for s in sources])}\")\n", + " iterations = 0\n", + " issue_detected = None\n", + " fixed = False\n", + "\n", + " if ENABLE_RECURSIVE_EDITING:\n", + " has_issues, issue_type = detect_answer_issues(\n", + " answer, query_text, 
answer_tokens, MAX_NEW_TOKENS\n", + " )\n", + "\n", + " if has_issues:\n", + " issue_detected = issue_type\n", + "\n", + " for iteration in range(MAX_EDIT_ITERATIONS):\n", + " new_context, new_scores = retry_with_better_retrieval(\n", + " query_text, task_type, context_text, issue_type\n", + " )\n", + "\n", + " if new_context:\n", + " context_text = new_context\n", + " if new_scores:\n", + " sources = [\n", + " doc.metadata.get(\"source\", \"Unknown\")\n", + " for doc, _ in results[: len(new_scores)]\n", + " ]\n", + "\n", + " iterations += 1\n", + " refined_answer = refine_failed_answer(\n", + " answer, context_text, query_text, issue_type\n", + " )\n", + "\n", + " refined_tokens = len(tokenizer.encode(refined_answer, truncation=False))\n", + " still_has_issues, _ = detect_answer_issues(\n", + " refined_answer, query_text, refined_tokens, MAX_NEW_TOKENS\n", + " )\n", + "\n", + " if not still_has_issues and refined_answer != answer:\n", + " answer = refined_answer\n", + " fixed = True\n", + " break\n", + " elif refined_answer != answer:\n", + " answer = refined_answer\n", + " else:\n", + " break\n", "\n", " return {\n", - " \"answer\": response_text,\n", + " \"answer\": answer,\n", " \"sources\": sources,\n", - " \"context\": context_text,\n", - " \"prompt\": prompt,\n", + " \"task_type\": task_type,\n", " \"scores\": [score for _, score in results],\n", + " \"iterations\": iterations,\n", + " \"issue_detected\": issue_detected,\n", + " \"fixed\": fixed,\n", " }\n", "\n", "\n", - "def ask(query_text):\n", - " result = generate_rag_response(query_text, verbose=True)\n", + "def ask(question, task_type=\"rag\"):\n", + " result = generate_rag_response(question, task_type)\n", + " print(f\"\\nQ: {question}\")\n", + " print(f\"A: {result['answer']}\")\n", " return result[\"answer\"]" ] }, @@ -362,11 +583,7 @@ " prompts_data = json.load(f)\n", "\n", "prompts = prompts_data[\"prompts\"]\n", - "print(f\"Loaded {len(prompts)} evaluation prompts\")\n", - 
"print(\"\\nCategories:\")\n", - "for category in [\"summarization\", \"reasoning\", \"rag\"]:\n", - " count = len([p for p in prompts if p[\"category\"] == category])\n", - " print(f\" - {category.title()}: {count} prompts\")" + "print(f\"Loaded {len(prompts)} evaluation prompts\")" ] }, { @@ -374,10 +591,14 @@ "id": "1001c44c", "metadata": {}, "source": [ - "# 11. Run automated evaluation\n", + "# 11. Run Prompts and Track Metrics\n", "\n", - "For each question, we generate an answer using the RAG system and print\n", - "both the model’s response and the expected answer (if provided)." + "Iterates over all prompts, generates RAG responses, retries if necessary, and collects metrics:\n", + "\n", + "- Measures **latency** and **energy usage** using `OfflineEmissionsTracker`. \n", + "- Retries queries with a lower threshold if no relevant information is found and recursive editing is enabled. \n", + "- Tracks **task-level success rates** and response metadata (`iterations`, `issue_detected`, `fixed`). 
\n", + "- Stores all results in a list for later analysis.\n" ] }, { @@ -388,31 +609,163 @@ "outputs": [], "source": [ "results = []\n", + "task_metrics = {\n", + " \"summarization\": {\"total\": 0, \"success\": 0},\n", + " \"reasoning\": {\"total\": 0, \"success\": 0},\n", + " \"rag\": {\"total\": 0, \"success\": 0},\n", + " \"paraphrasing\": {\"total\": 0, \"success\": 0},\n", + " \"creative_generation\": {\"total\": 0, \"success\": 0},\n", + "}\n", + "\n", + "total_latency = 0\n", + "latencies = []\n", + "\n", + "tracker_inference = OfflineEmissionsTracker(\n", + " country_iso_code=COUNTRY_ISO_CODE, project_name=\"inference\", log_level=\"error\"\n", + ")\n", + "tracker_inference.start()\n", "\n", "for p in prompts:\n", - " question = p[\"prompt\"]\n", - " expected = p.get(\"expected_answer\", None)\n", - " print(f\"\\nTesting Prompt {p['id']}: {question}\")\n", + " start_time = time.time()\n", + " result = generate_rag_response(p[\"prompt\"], task_type=p[\"category\"])\n", + "\n", + " if (\n", + " result[\"answer\"] == \"No relevant information found.\"\n", + " and ENABLE_RECURSIVE_EDITING\n", + " ):\n", + " print(\" [Retrying with lower threshold.]\")\n", + " retry_results = db.similarity_search_with_relevance_scores(p[\"prompt\"], k=8)\n", + "\n", + " if len(retry_results) > 0 and retry_results[0][1] > 0.1:\n", + " retry_context = \"\\n\\n\".join(\n", + " [doc.page_content for doc, _ in retry_results[:5]]\n", + " )[:800]\n", + "\n", + " retry_prompt = f\"\"\"Context: {retry_context}\n", + "\n", + "Question: {p[\"prompt\"]}\n", + "\n", + "Answer:\"\"\"\n", + "\n", + " retry_tokens = tokenizer.encode(retry_prompt, truncation=False)\n", + " if len(retry_tokens) > 450:\n", + " retry_prompt = tokenizer.decode(\n", + " retry_tokens[:450], skip_special_tokens=True\n", + " )\n", + " retry_prompt += f\"\\n\\nQuestion: {p['prompt']}\\n\\nAnswer:\"\n", + "\n", + " retry_answer = llm.invoke(retry_prompt)\n", "\n", - " result = generate_rag_response(question, 
verbose=False)\n", - " answer = result[\"answer\"]\n", + " if retry_answer and retry_answer != \"No relevant information found.\":\n", + " result[\"answer\"] = retry_answer\n", + " result[\"fixed\"] = True\n", + " result[\"iterations\"] = 1\n", + " result[\"issue_detected\"] = \"no_info\"\n", + "\n", + " end_time = time.time()\n", + "\n", + " latency = end_time - start_time\n", + " total_latency += latency\n", + " latencies.append(latency)\n", + "\n", + " edit_info = \"\"\n", + " if result.get(\"iterations\", 0) > 0:\n", + " status = \"Fixed\" if result.get(\"fixed\") else \"Attempted\"\n", + " issue = result.get(\"issue_detected\", \"unknown\")\n", + " edit_info = f\" [{status}: {issue}, {result['iterations']}x]\"\n", + "\n", + " print(f\"\\nPrompt {p['id']}: {p['prompt']}\")\n", + " print(\n", + " f\"Answer: {result['answer'][:150]}{'...' if len(result['answer']) > 150 else ''}{edit_info}\"\n", + " )\n", + " print(f\"Latency: {latency:.3f}s\")\n", + "\n", + " task_metrics[p[\"category\"]][\"total\"] += 1\n", + " if result[\"answer\"] != \"No relevant information found.\":\n", + " task_metrics[p[\"category\"]][\"success\"] += 1\n", "\n", " results.append(\n", " {\n", " \"id\": p[\"id\"],\n", " \"category\": p[\"category\"],\n", " \"difficulty\": p[\"difficulty\"],\n", - " \"prompt\": question,\n", - " \"answer\": answer,\n", - " \"expected\": expected,\n", - " \"context_used\": len(result[\"context\"]),\n", - " \"top_sources\": result[\"sources\"],\n", + " \"prompt\": p[\"prompt\"],\n", + " \"answer\": result[\"answer\"],\n", + " \"expected\": p.get(\"expected_answer\"),\n", + " \"scores\": result[\"scores\"],\n", + " \"iterations\": result.get(\"iterations\", 0),\n", + " \"issue_detected\": result.get(\"issue_detected\"),\n", + " \"fixed\": result.get(\"fixed\", False),\n", + " \"latency\": latency,\n", " }\n", " )\n", "\n", - " print(f\" Model Answer: {answer}\")\n", - " if expected:\n", - " print(f\" Expected: {expected}\")" + "emissions_inference = 
tracker_inference.stop()\n", + "energy_inference = (\n", + " tracker_inference._total_energy.kWh\n", + " if hasattr(tracker_inference._total_energy, \"kWh\")\n", + " else 0\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9b9fb6bf", + "metadata": {}, + "source": [ + "# 12. Performance and Carbon Emissions\n", + "\n", + "Calculates and prints performance metrics for all prompts:\n", + "\n", + "- **Latency:** total, average, minimum, and maximum per query. \n", + "- **Carbon emissions and energy consumption:** for model loading, embeddings, and inference. \n", + "- Computes total and per-query values for both CO2 emissions and energy usage.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09bae5de", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"PERFORMANCE METRICS\")\n", + "\n", + "avg_latency = total_latency / len(prompts)\n", + "min_latency = min(latencies)\n", + "max_latency = max(latencies)\n", + "\n", + "print(f\"Total latency: {total_latency:.3f}s\")\n", + "print(f\"Average per query: {avg_latency:.3f}s\")\n", + "print(f\"Min latency: {min_latency:.3f}s\")\n", + "print(f\"Max latency: {max_latency:.3f}s\")\n", + "\n", + "print(\"\\n CARBON EMISSIONS & ENERGY\")\n", + "total_emissions = emissions_loading + emissions_embeddings + emissions_inference\n", + "\n", + "try:\n", + " energy_loading = tracker_loading._total_energy.kWh\n", + " energy_embeddings = tracker_embeddings._total_energy.kWh\n", + " energy_inference_val = energy_inference\n", + " total_energy = energy_loading + energy_embeddings + energy_inference_val\n", + "\n", + " print(f\"Model Loading: {emissions_loading:.6f} kg CO2 | {energy_loading:.6f} kWh\")\n", + " print(\n", + " f\"Embeddings: {emissions_embeddings:.6f} kg CO2 | {energy_embeddings:.6f} kWh\"\n", + " )\n", + " print(\n", + " f\"Inference: {emissions_inference:.6f} kg CO2 | {energy_inference_val:.6f} kWh\"\n", + " )\n", + " print(f\"TOTAL: {total_emissions:.6f} kg CO2 | {total_energy:.6f} 
kWh\")\n", + " print(\n", + " f\"Per query: {emissions_inference / len(prompts):.6f} kg CO2 | {energy_inference_val / len(prompts):.6f} kWh\"\n", + " )\n", + "except Exception:\n", + " print(f\"Model Loading: {emissions_loading:.6f} kg CO2\")\n", + " print(f\"Embeddings: {emissions_embeddings:.6f} kg CO2\")\n", + " print(f\"Inference: {emissions_inference:.6f} kg CO2\")\n", + " print(f\"TOTAL: {total_emissions:.6f} kg CO2\")\n", + " print(f\"Per query: {emissions_inference / len(prompts):.6f} kg CO2\")" ] } ], diff --git a/rag_and_distilled_model/README.md b/rag_and_distilled_model/README.md index cae7c0c..293d4da 100644 --- a/rag_and_distilled_model/README.md +++ b/rag_and_distilled_model/README.md @@ -31,19 +31,37 @@ JSON file. Rag + Distilled Model/ ├── Apollo11_rag&distilled.ipynb ← Main Jupyter Notebook ├── README.md ← Project documentation +├── model_answers.md ← Generated answers for all prompts +├── emissions.csv ← Energy usage and carbon emissions report └── data/ - ├── apollo11_docs.json ← Apollo 11 text dataset and evaluation prompts + ├── test_data.json ← Apollo 11 text dataset and evaluation prompts └── chroma_db/ ← Auto-created vector database folder (It will appear after you run it) -``` +```` + +> **Note:** `model_answers.md` contains the model’s generated answers for all +prompts, organized by task type (Summarization, Reasoning, RAG, Paraphrasing, +Creative Generation) along with latency information. This file is intended +for inspection, reporting, or downstream evaluation. +> `emissions.csv` contains energy consumption and CO2 emissions data +for model loading, embeddings, and inference for all prompts. --- ## Models Used -* **LaMini-Flan-T5-248M**: It is a Local LLM and it is a distilled version of -Google's Flan-T5, optimized for lightweight text generation tasks. +* **LaMini-Flan-T5-248M**: This model is one of the LaMini-LM model series in +paper "LaMini-LM: A Diverse Herd of Distilled Models from Large-Scale +Instructions". 
It is a local LLM and a distilled version of +Google's Flan-T5, fine-tuned on the LaMini-instruction dataset; it is +optimized for lightweight text generation tasks. Used here for reasoning, summarization, and RAG response generation. +Its knowledge is distilled from large language models by performing +sentence/offline distillation (Kim and Rush, 2016). A total of +2.58M pairs of instructions and responses was generated using gpt-3.5-turbo +based on several existing resources of prompts, including self-instruct +(Wang et al., 2022), P3 (Sanh et al., 2022), Flan (Longpre et al., 2023) and +Alpaca (Taori et al., 2023). * **all-MiniLM-L6-v2**: It as an Embedding model and it is a compact sentence-transformer model used to convert text chunks into numerical vector embeddings for semantic search and retrieval. @@ -56,8 +74,12 @@ or CPU execution. ## Notes * The ChromaDB folder (data/chroma_db/) is automatically generated when you first -run the document loader. + run the document loader. * You can safely delete it to rebuild embeddings later. * The notebook does not require an external .txt file — all content is inside -the JSON. + the JSON. * The model automatically detects whether to use GPU (torch.cuda.is_available()). +* **Model answers:** The generated responses for all prompts are saved in + `model_answers.md` for easy reference and evaluation. +* **Emissions report:** Energy consumption and carbon emissions data for model + loading, embeddings, and inference are saved in `emissions.csv`. 
diff --git a/rag_and_distilled_model/data/test_data.json b/rag_and_distilled_model/data/test_data.json index b4d9139..d36d8d1 100644 --- a/rag_and_distilled_model/data/test_data.json +++ b/rag_and_distilled_model/data/test_data.json @@ -9,7 +9,15 @@ "license": "CC BY-SA 3.0", "note": "Excerpted passages from Wikipedia sections; individual sentences unchanged, some paragraphs omitted for length", "word_count": "approximately 1,400 words", - "language": "English" + "language": "English", + "master_instruction": "You are an expert assistant. Answer only using the provided context. Keep answers clear, concise, and natural. Do not provide unnecessary details or repeat information. Do not mention or refer to the document, context, or data source.", + "task_instructions": { + "summarization": "Summarize or describe information clearly and concisely.", + "reasoning": "Provide short, well-structured answers (2-5 sentences). Use only logical reasoning. Do not add assumptions or outside facts.", + "rag": "Provide precise and direct answers using only the given context. Avoid explanation unless explicitly requested.", + "paraphrasing": "Rewrite the given information in your own words. Preserve meaning and tone without copying phrases directly. The output should read naturally like an original paragraph.", + "creative_generation": "Use the context as inspiration, but do not copy it. Expand or interpret the ideas creatively, producing a short paragraph. Keep the tone natural and imaginative, as if writing your own reflection." + } }, "source_text": "As the descent began, Armstrong and Aldrin found themselves passing landmarks on the surface two or three seconds early, and reported that they were \"long\"; they would land miles west of their target point. Eagle was traveling too fast. 
The problem could have been mascons—concentrations of high mass in a region or regions of the Moon's crust that contains a gravitational anomaly, potentially altering Eagle's trajectory.\n\nFive minutes into the descent burn, and 6,000 feet (1,800 m) above the surface of the Moon, the LM guidance computer (LGC) distracted the crew with the first of several unexpected 1201 and 1202 program alarms. Inside Mission Control Center, computer engineer Jack Garman told Guidance Officer Steve Bales it was safe to continue the descent, and this was relayed to the crew. The program alarms indicated \"executive overflows\", meaning the guidance computer could not complete all its tasks in real-time and had to postpone some of them. Margaret Hamilton, the Director of Apollo Flight Computer Programming at the MIT Charles Stark Draper Laboratory later recalled: \"To blame the computer for the Apollo 11 problems is like blaming the person who spots a fire and calls the fire department. Actually, the computer was programmed to do more than recognize error conditions. A complete set of recovery programs was incorporated into the software. The software's action, in this case, was to eliminate lower priority tasks and re-establish the more important ones. The computer, rather than almost forcing an abort, prevented an abort. If the computer hadn't recognized this problem and taken recovery action, I doubt if Apollo 11 would have been the successful Moon landing it was.\"\n\nWhen Armstrong again looked outside, he saw that the computer's landing target was in a boulder-strewn area just north and east of a 300-foot-diameter (91 m) crater, so he took semi-automatic control. Throughout the descent, Aldrin called out navigation data to Armstrong, who was busy piloting Eagle. 
Now 107 feet (33 m) above the surface, Armstrong knew their propellant supply was dwindling and was determined to land at the first possible landing site.\n\nArmstrong found a clear patch of ground and maneuvered the spacecraft towards it. They were now 100 feet (30 m) from the surface, with only 90 seconds of propellant remaining. Lunar dust kicked up by the LM's engine began to impair his ability to determine the spacecraft's motion.\n\nA light informed Aldrin that at least one of the 67-inch (170 cm) probes hanging from Eagle's footpads had touched the surface and he said: \"Contact light!\" Three seconds later, Eagle landed and Armstrong shut the engine down. Aldrin immediately said \"Okay, engine stop.\"\n\nEagle landed at 20:17:40 UTC on Sunday July 20 with 216 pounds (98 kg) of usable fuel remaining. Information available to the crew and mission controllers during the landing showed the LM had enough fuel for another 25 seconds of powered flight before an abort without touchdown would have become unsafe, but post-mission analysis showed that the real figure was probably closer to 50 seconds.\n\nArmstrong acknowledged Aldrin's completion of the post-landing checklist with \"Engine arm is off\", before responding to the CAPCOM, Charles Duke, with the words, \"Houston, Tranquility Base here. The Eagle has landed.\" Duke expressed the relief at Mission Control: \"Roger, Twan—Tranquility, we copy you on the ground. You got a bunch of guys about to turn blue. We're breathing again. Thanks a lot.\"\n\nPreparations for Neil Armstrong and Buzz Aldrin to walk on the Moon began at 23:43 UTC. These took longer than expected; three and a half hours instead of two. Six hours and thirty-nine minutes after landing, Armstrong and Aldrin were ready to go outside, and Eagle was depressurized.\n\nEagle's hatch was opened at 02:39:33. Armstrong initially had some difficulties squeezing through the hatch with his portable life support system (PLSS). 
At 02:51 Armstrong began his descent to the lunar surface. Climbing down the nine-rung ladder, Armstrong pulled a D-ring to deploy the modular equipment stowage assembly (MESA) folded against Eagle's side and activate the TV camera.\n\nDespite some technical and weather difficulties, black and white images of the first lunar EVA were received and broadcast to at least 600 million people on Earth.\n\nAfter describing the surface dust as \"very fine-grained\" and \"almost like a powder\", at 02:56:15, six and a half hours after landing, Armstrong stepped off Eagle's landing pad and declared: \"That's one small step for [a] man, one giant leap for mankind.\"\n\nArmstrong intended to say \"That's one small step for a man\", but the word \"a\" is not audible in the transmission, and thus was not initially reported by most observers of the live broadcast. When later asked about his quote, Armstrong said he believed he said \"for a man\", and subsequent printed versions of the quote included the \"a\" in square brackets.\n\nAbout seven minutes after stepping onto the Moon's surface, Armstrong collected a contingency soil sample using a sample bag on a stick. Twelve minutes after the sample was collected, he removed the TV camera from the MESA and made a panoramic sweep, then mounted it on a tripod. Aldrin joined Armstrong on the surface. He described the view with the simple phrase: \"Magnificent desolation.\"\n\nArmstrong said moving in the lunar gravity, one-sixth of Earth's, was \"even perhaps easier than the simulations ... It's absolutely no trouble to walk around.\" Aldrin joined him on the surface and tested methods for moving around, including two-footed kangaroo hops. The PLSS backpack created a tendency to tip backward, but neither astronaut had serious problems maintaining balance. The fine soil was quite slippery.\n\nThe astronauts planted the Lunar Flag Assembly containing a flag of the United States on the lunar surface, in clear view of the TV camera. 
Aldrin remembered, \"Of all the jobs I had to do on the Moon the one I wanted to go the smoothest was the flag raising.\" But the astronauts struggled with the telescoping rod and could only insert the pole about 2 inches (5 cm) into the hard lunar surface. Before Aldrin could take a photo of Armstrong with the flag, President Richard Nixon spoke to them through a telephone-radio transmission, which Nixon called \"the most historic phone call ever made from the White House.\"\n\nThey deployed the EASEP, which included a Passive Seismic Experiment Package used to measure moonquakes and a retroreflector array used for the lunar laser ranging experiment. Then Armstrong walked 196 feet (60 m) from the LM to take photographs at the rim of Little West Crater while Aldrin collected two core samples. He used the geologist's hammer to pound in the tubes—the only time the hammer was used on Apollo 11—but was unable to penetrate more than 6 inches (15 cm) deep.\n\nThe astronauts then collected rock samples using scoops and tongs on extension handles. Many of the surface activities took longer than expected, so they had to stop documenting sample collection halfway through the allotted 34 minutes. Aldrin shoveled 6 kilograms (13 lb) of soil into the box of rocks to pack them in tightly. Two types of rocks were found in the geological samples: basalt and breccia.\n\nWhile on the surface, Armstrong uncovered a plaque mounted on the LM ladder, bearing two drawings of Earth, an inscription, and signatures of the astronauts and President Nixon. The inscription read: \"Here men from the planet Earth first set foot upon the Moon July 1969, A. D. We came in peace for all mankind.\"\n\nMission Control used a coded phrase to warn Armstrong his metabolic rates were high, and that he should slow down. 
As metabolic rates remained generally lower than expected for both astronauts throughout the walk, Mission Control granted the astronauts a 15-minute extension.\n\nAldrin entered Eagle first. With some difficulty the astronauts lifted film and two sample boxes containing 21.55 kilograms (47.5 lb) of lunar surface material to the LM hatch using a flat cable pulley device called the Lunar Equipment Conveyor (LEC). Armstrong then jumped onto the ladder's third rung, and climbed into the LM. After transferring to LM life support, the explorers lightened the ascent stage for the return to lunar orbit by tossing out their PLSS backpacks, lunar overshoes, an empty Hasselblad camera, and other equipment. The hatch was closed again at 05:11:13. They then pressurized the LM and settled down to sleep.", @@ -124,16 +132,60 @@ "prompt": "How much usable fuel remained when Eagle landed, and how many seconds of powered flight did this represent?", "type": "complex_retrieval", "expected_answer": "216 pounds (98 kg); about 25 seconds according to initial estimates, but post-mission analysis showed closer to 50 seconds" + }, + { + "id": 16, + "category": "paraphrasing", + "difficulty": "easy", + "prompt": "In your own words, describe what happened when the computer alarms appeared during the landing.", + "type": "text_based_retelling" + }, + { + "id": 17, + "category": "paraphrasing", + "difficulty": "medium", + "prompt": "Explain how Armstrong's decisions, actions, and teamwork during the descent contributed to the mission's success.", + "type": "text_based_synthesis" + }, + { + "id": 18, + "category": "paraphrasing", + "difficulty": "medium", + "prompt": "Describe how the astronauts collected and handled Moon samples using your own words.", + "type": "text_based_retelling" + }, + { + "id": 19, + "category": "creative_generation", + "difficulty": "easy", + "prompt": "Imagine being one of the people in Mission Control. 
How would you feel while watching the landing?", + "type": "interpretive_generation" + }, + { + "id": 20, + "category": "creative_generation", + "difficulty": "medium", + "prompt": "Write a short paragraph about what the Moon landing might have shown about human courage.", + "type": "interpretive_generation" + }, + { + "id": 21, + "category": "creative_generation", + "difficulty": "medium", + "prompt": "Describe how life on Earth might have changed after people saw the first Moon landing.", + "type": "interpretive_generation" } ], "evaluation_notes": { - "testing_approach": "All 15 prompts should be tested across all models to ensure a fair comparison.", + "testing_approach": "All prompts should be tested across all models to ensure a fair comparison, following this order: master instruction first, then the task-specific instruction, and then the prompts within that task.", "prompt_categories": { "summarization": "Prompts 1-5 test condensing and extracting key information", "reasoning": "Prompts 6-10 test analysis, inference, and logical connections", - "rag": "Prompts 11-15 test retrieval accuracy from source text" + "rag": "Prompts 11-15 test retrieval accuracy from source text", + "paraphrasing": "Prompts 16-18 test text-based retelling in the model's own words", + "creative_generation": "Prompts 19-21 test interpretive and imaginative responses inspired by the text" }, - "note": "Some prompts may be more challenging for smaller models, but attempting all prompts provides complete evaluation data" + "note": "Some prompts may be more challenging for smaller models, but attempting all prompts provides complete evaluation data. Paraphrasing and creative generation prompts assess generation quality beyond factual accuracy." 
} } diff --git a/rag_and_distilled_model/emissions.csv b/rag_and_distilled_model/emissions.csv new file mode 100644 index 0000000..643d602 --- /dev/null +++ b/rag_and_distilled_model/emissions.csv @@ -0,0 +1,4 @@ +timestamp,project_name,run_id,experiment_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,water_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,on_cloud,pue,wue +2025-11-16T01:28:25,model_loading,f82aeab2-c38a-4371-be3a-1997fbb47469,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,13.77340804299996,0.0001749558997069,1.2702440758365311e-05,42.5,27.76085465118873,10.0,0.0001624449990458,0.0001061089737759,3.82215121055555e-05,0.0003067754849273,0.0,Egypt,EGY,,,,Linux-6.6.105+-x86_64-with-glibc2.35,3.12.12,3.1.0,2,Intel(R) Xeon(R) CPU @ 2.00GHz,1,1 x Tesla T4,,,12.671436309814451,machine,N,1.0,0.0 +2025-11-16T01:28:36,codecarbon,b737d2be-858f-4f13-8fdd-44a38c568be5,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,9.368741295000008,0.000119456445036,1.2750532998468423e-05,42.5,28.060567855432705,10.0,0.0001105020664916,7.295922503400005e-05,2.599897197500004e-05,0.0002094602635006,0.0,Egypt,EGY,,,,Linux-6.6.105+-x86_64-with-glibc2.35,3.12.12,3.1.0,2,Intel(R) Xeon(R) CPU @ 2.00GHz,1,1 x Tesla T4,,,12.671436309814451,machine,N,1.0,0.0 +2025-11-16T01:29:32,inference,4903bd93-05fb-4e13-9c04-6b84af574e28,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,53.55844873299998,0.0008210196470009611,1.5329414245993477e-05,42.5,44.75440213564876,10.0,0.0006321736813256934,0.0006586960825119999,0.00014874273717222227,0.0014396125010099157,0.0,Egypt,EGY,,,,Linux-6.6.105+-x86_64-with-glibc2.35,3.12.12,3.1.0,2,Intel(R) Xeon(R) CPU @ 2.00GHz,1,1 x Tesla T4,,,12.671436309814453,machine,N,1.0,0.0 diff --git a/rag_and_distilled_model/model_answers.md b/rag_and_distilled_model/model_answers.md new file 
mode 100644 index 0000000..7b8ea81 --- /dev/null +++ b/rag_and_distilled_model/model_answers.md @@ -0,0 +1,178 @@ +# NLP Task Results + +## Summarization Tasks + +**Prompt 1:** Summarize the main events during the Apollo 11 lunar landing in +3 sentences. +**Answer:** The computer prevented an abort during the Apollo 11 lunar landing, +and a complete set of recovery programs were incorporated into the software. +**Latency:** 5.328s + +**Prompt 2:** What were the main challenges Armstrong faced while landing the +Eagle? +**Answer:** Armstrong initially had some difficulties squeezing through the +hatch with his portable life support system (PLSS). +**Latency:** 1.405s + +**Prompt 3:** Describe the activities the astronauts performed on the lunar +surface. +**Answer:** The astronauts planted the Lunar Flag Assembly containing a flag +of the United States on the lunar surface. +**Latency:** 6.323s + +**Prompt 4:** Explain what scientific equipment the astronauts deployed on +the Moon. +**Answer:** The astronauts deployed the EASEP, which included a Passive Seismic +Experiment Package used to measure moonquakes and a retroreflector array used +for the lunar laser ranging experiment. +**Latency:** 2.617s + +**Prompt 5:** Compare the planned timeline for the lunar surface operations +with what actually happened. +**Answer:** The planned timeline for the lunar surface operations was three +and a half hours instead of two, which took longer than expected. +**Latency:** 0.581s + +--- + +## Reasoning Tasks + +**Prompt 6:** Why did the computer alarms (1201 and 1202) occur during the +descent? +**Answer:** The computer alarms (1201 and 1202) occurred during the descent +to indicate "executive overflows", meaning the guidance computer could not +complete all its tasks in real-time and had to postpone some of them. +**Latency:** 1.096s + +**Prompt 7:** What would have happened if Armstrong had not taken manual +control during the landing? 
+**Answer:** If Armstrong had not taken manual control during the landing, +the computer's landing target would have been in a boulder-strewn area +just north and east of a 300-foot-diameter (91 m) crater. +**Latency:** 3.110s + +**Prompt 8:** Why did Armstrong's famous quote become controversial? +**Answer:** Armstrong's famous quote became controversial because the word +"a" was not audible in the transmission and was not initially reported by +most observers of the live broadcast. +**Latency:** 1.471s + +**Prompt 9:** Analyze how the fuel situation during landing reflects the +risk management challenges of the mission. +**Answer:** The fuel situation during landing reflects the risk management +challenges of the mission as the LM had enough fuel for another 25 seconds +of powered flight before an abort without touchdown would have become unsafe, +but post-mission analysis showed that the real figure was probably closer to +50 seconds. +**Latency:** 5.566s + +**Prompt 10:** Based on the text, what does Margaret Hamilton's statement reveal +about the Apollo Guidance Computer's design philosophy? +**Answer:** Margaret Hamilton's statement reveals that the Apollo Guidance +Computer was programmed to do more than recognize error conditions and +eliminate lower priority tasks and re-establish the more important ones. +**Latency:** 0.977s + +--- + +## RAG Tasks + +**Prompt 11:** At what time (UTC) did Eagle land on the Moon? +**Answer:** Eagle landed at 20:17:40 UTC on Sunday July 20. +**Latency:** 0.519s + +**Prompt 12:** How much lunar material did the astronauts collect? +**Answer:** The astronauts collected 21.55 kilograms (47.5 lb) of lunar surface +material. +**Latency:** 0.580s + +**Prompt 13:** What was Armstrong's famous first words when stepping on the +Moon? +**Answer:** Armstrong's famous first words when stepping on the Moon were +"Here men from the planet Earth first set foot upon the Moon July 1969, +A. D. We came in peace for all mankind." 
+**Expected:** That's one small step for [a] man, one giant leap for mankind +**Latency:** 0.979s + +**Prompt 14:** What scientific instruments were included in the EASEP package? +**Answer:** The EASEP package included a Passive Seismic Experiment Package +used to measure moonquakes and a retroreflector array used for the lunar laser +ranging experiment. +**Expected:** Passive Seismic Experiment Package and retroreflector array +**Latency:** 0.897s + +**Prompt 15:** How much usable fuel remained when Eagle landed, and how many +seconds of powered flight did this represent? +**Answer:** The LM had enough fuel for another 25 seconds of powered flight +before an abort without touchdown would have become unsafe, but post-mission +analysis showed that the real figure was probably closer to 50 seconds. +**Expected:** 216 pounds (98 kg); about 25 seconds according to initial +estimates, but post-mission analysis showed closer to 50 seconds +**Latency:** 1.226s + +--- + +## Paraphrasing Tasks + +**Prompt 16:** In your own words, describe what happened when the computer +alarms appeared during the landing. +**Answer:** When the computer alarms appeared during the Apollo 11 landing, +the crew was distracted by several unexpected 1201 and 1202 program alarms. +The alarms indicated "executive overflows", meaning the guidance computer +could not complete all its tasks in real-time and had to postpone some of +them. Margaret Hamilton, the Director of Apollo Flight Computer Programming +at the MIT Charles Stark Draper Laboratory, later recalled that to blame the +computer for the Apollo 11 problems is like blaming the person knew their +propellant supply was dwindling and was determined to land at the first +possible landing site. The computer, rather than almost forcing an abort, +prevented an abort. If the computer hadn't recognized this problem and took +recovery action, I doubt if Apollo 11 would have been the successful Moon +landing it was. 
+**Latency:** 4.525s + +**Prompt 17:** Explain how Armstrong's decisions, actions, and teamwork +during the descent contributed to the mission's success. +**Answer:** Armstrong's decisions, actions, and teamwork during the descent +contributed to the mission's success by ensuring that the Eagle had landed +safely on the Moon's surface. Aldrin called out navigation data to Armstrong, +who was busy piloting the Eagle. This helped to ensure that the Eagle's +propellant supply was dwindling and that Armstrong was able to safely land +on the Moon's surface. +**Latency:** 2.025s + +**Prompt 18:** Describe how the astronauts collected and handled Moon samples +using your own words. +**Answer:** The astronauts collected a contingency soil sample using a sample +bag on a stick, removed the TV camera from the MESA and made a panoramic sweep, +mounted it on a tripod, and then collected rock samples using scoops and tongs +on extension handles. +**Latency:** 6.354s + +--- + +## Creative Generation Tasks + +**Prompt 19:** Imagine being one of the people in Mission Control. How would +you feel while watching the landing? +**Answer:** I'm sorry, but as an AI language model, I don't have the ability +to feel emotions. However, based on the given context, it is possible that +one of the people in Mission Control would feel anxious or nervous while +watching the landing. +**Latency:** 1.143s + +**Prompt 20:** Write a short paragraph about what the Moon landing might have +shown about human courage. +**Answer:** The Moon landing on July 20, 1969, was a significant moment in +human history as it marked the first time humans ever set foot on the moon. +It was a moment of immense pride and admiration for the bravery and +determination of Neil Armstrong and Buzz Aldrin, as well as a reminder +of the importance of standing up for what is right, even in the face of +adversity. 
+**Latency:** 2.631s + +**Prompt 21:** Describe how life on Earth might have changed after people +saw the first Moon landing. +**Answer:** Life on Earth might have changed after people saw the first Moon +landing due to the dwindling propellant supply and the determination of +Armstrong and Aldrin to land at the first possible landing site. +**Latency:** 1.017s