Project_7_AI_Web_Crawler/webpage_query_gemini_ui.py at main · kamalviewcode-spec/Project_7_AI_Web_Crawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# =====================================================================
# FILE: webpage_query_gemini_ui.py
# =====================================================================
# Kamal AI Web Crawler & Question Answering — Google Gemini Edition
# =====================================================================
#
# PURPOSE:
#   Same workflow as webpage_query_groq_ui.py, but uses Google's Gemini
#   models as the LLM instead of Groq. Uses ChatGoogleGenerativeAI from
#   langchain_google_genai, which reads GOOGLE_API_KEY from the .env file.
#
# WORKFLOW:
#   1. User enters a website URL in the "Scrape & Index" tab
#   2. Click "Discover Links" → UI finds all internal links on that page
#   3. User selects which links to crawl using the multiselect dropdown
#   4. Click "Crawl & Index" → real-time scraping with live log output
#   5. Scraped content is cleaned, chunked, embedded → FAISS vector store
#   6. Switch to "Ask Questions" tab → type a question → get AI answer
#
# TECH STACK:
#   - UI:           Gradio 6.x (streaming generators for real-time progress)
#   - LLM:          Google Gemini API (gemini-2.0-flash, gemini-1.5-pro, gemini-1.5-flash)
#   - Embeddings:   HuggingFace sentence-transformers (runs locally)
#   - Vector Store: FAISS (in-memory, fast similarity search)
#   - RAG:          LangChain LCEL (4 chain types: stuff/map_reduce/refine/map_rerank)
#
# GOOGLE GEMINI API:
#   - Requires GOOGLE_API_KEY in your .env file
#   - Uses langchain_google_genai.ChatGoogleGenerativeAI
#   - Models available:
#       gemini-2.0-flash        → Latest, fastest Gemini 2.0 (recommended)
#       gemini-1.5-pro          → Most capable Gemini 1.5, 1M context window
#       gemini-1.5-flash        → Fast and efficient Gemini 1.5
#       gemini-1.5-flash-8b     → Smallest and fastest, high-volume tasks
#
# USAGE:
#   python webpage_query_gemini_ui.py
#   Then open http://localhost:7864 in your browser
#   (Port 7864 avoids conflict with Groq:7860, DeepSeek:7861,
#    Claude:7862, OpenAI:7863)
# =====================================================================


# ─────────────────────────────────────────────────────────────────────
# IMPORTS
# ─────────────────────────────────────────────────────────────────────

import gradio as gr                         # Web UI framework
import requests                             # HTTP requests for web scraping
from bs4 import BeautifulSoup               # HTML parser
import tldextract                           # Extract domain from URL
from urllib.parse import urljoin, urlparse  # URL manipulation utilities
import html2text                            # Convert HTML to Markdown
import re                                   # Regular expressions for text cleaning
import os                                   # OS utilities (env vars, file paths)
from typing import Generator               # Type hint for generator functions

# LangChain vector store + embeddings
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# LangChain text splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# LangChain core: prompts and LCEL building blocks
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser   # Converts LLM output → plain string
from langchain_core.runnables import RunnablePassthrough    # Passes input through unchanged

# ── Google Gemini LLM integration ─────────────────────────────────────
# ChatGoogleGenerativeAI reads GOOGLE_API_KEY from the environment.
# Connects to Google's Generative AI API (Gemini model family).
from langchain_google_genai import ChatGoogleGenerativeAI

# Load GOOGLE_API_KEY and other secrets from the .env file
from dotenv import load_dotenv
load_dotenv()


# =====================================================================
# SECTION 1: WEB SCRAPING FUNCTIONS
# =====================================================================
# Identical to webpage_query_groq_ui.py — scraping logic is LLM-independent.


def discover_links(base_url: str) -> tuple[list[str], str]:
    """
    Fetch only the base URL and collect the internal hyperlinks found on it.
    This is the "discovery" step — linked pages are NOT crawled here; the
    caller decides which of the returned links to crawl.

    A link is kept when all of the following hold:
      - it resolves to an http:// or https:// URL,
      - its domain label (as reported by tldextract) equals the base URL's,
      - after removing any "#fragment" it differs from the base URL,
      - it has not already been collected.

    Any failure (network error, HTTP 4xx/5xx, parse error) is reported in
    the returned log string instead of being raised.

    Args:
        base_url (str): The starting URL to inspect for links.

    Returns:
        links (list[str]): Unique internal URLs in page order.
        log   (str):       Status message shown in the UI.
    """
    # Normalise once so the request, URL joining and the self-link check
    # all operate on the same string.
    base_url = base_url.strip()

    links: list[str] = []
    seen: set[str] = set()   # O(1) duplicate check; `links` preserves order
    log = ""

    try:
        # Fetch the HTML of the base URL with a 10-second timeout
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()  # Raise for 4xx / 5xx HTTP errors

        soup = BeautifulSoup(response.text, "html.parser")
        base_domain = tldextract.extract(base_url).domain

        # Loop through every anchor tag that has an href attribute
        for link_tag in soup.find_all("a", href=True):
            # Resolve relative hrefs, then strip the fragment so that
            # "/docs#install" is discovered as "/docs" rather than dropped.
            full_url = urljoin(base_url, link_tag["href"]).split("#", 1)[0]

            # mailto:, javascript:, ftp: etc. cannot be scraped later
            if urlparse(full_url).scheme not in ("http", "https"):
                continue

            # Compare the extracted domain label exactly; a substring test
            # on the host would also accept hosts such as
            # "not<domain>.example.com".
            if tldextract.extract(full_url).domain != base_domain:
                continue

            if full_url == base_url or full_url in seen:
                continue

            seen.add(full_url)
            links.append(full_url)

        log = (
            f"✅ Base URL scraped successfully.\n"
            f"🔗 Found {len(links)} internal link(s) on this page."
        )

    except Exception as e:
        # Deliberately broad: this is a UI boundary and the message is
        # displayed to the user rather than propagated.
        log = f"❌ Failed to scrape base URL: {e}"

    return links, log


def clean_scraped_text(text: str) -> str:
    """
    Collapse redundant whitespace in raw scraped text.

    The substitutions below are applied in order:
      1. Runs of spaces/tabs              → a single space
      2. Runs of blank lines              → exactly one blank line
      3. Spaces/tabs before a newline     → removed
      4. Spaces/tabs after a newline      → removed
    Finally, leading and trailing whitespace of the whole text is stripped.

    Args:
        text (str): Raw scraped text.

    Returns:
        str: The normalized text.
    """
    # (pattern, replacement) pairs; order matters because later rules
    # rely on the output of earlier ones.
    substitutions = (
        (r"[ \t]+",    " "),      # Step 1
        (r"\n\s*\n+", "\n\n"),   # Step 2
        (r"[ \t]+\n",  "\n"),     # Step 3
        (r"\n[ \t]+",  "\n"),     # Step 4
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


def scrape_urls_streaming(urls: list[str]) -> Generator:
    """
    Scrape a list of URLs and yield real-time progress tuples.

    This is a streaming generator intended for a Gradio handler:
      - Each `yield` carries the full running log plus a short status.
      - The final `yield` additionally carries the cleaned Markdown text.

    Only <p>, <h1>–<h3> and <code> elements are extracted. An element is
    skipped when one of its ancestors has a CSS class containing "menu",
    "sidebar", "nav" or "footer", or when it is nested inside another
    extracted element (its text is already part of that ancestor's output,
    so emitting it again would duplicate content in the index).

    A failure on one URL is logged and does not stop the remaining URLs.
    Failure to write the on-disk copy is logged and does not prevent the
    final yield.

    Args:
        urls (list[str]): Ordered list of URLs to scrape.

    Yields:
        Tuple[str, str, str]: (log_text, status_label, cleaned_markdown)
          - log_text:         Full running log (grows with each yield)
          - status_label:     Short string shown in the status indicator
          - cleaned_markdown: Non-empty only on the final yield
    """
    # Configure html2text: convert HTML → Markdown
    h              = html2text.HTML2Text()
    h.ignore_links = False   # Keep hyperlinks in output
    h.body_width   = 0       # No line-wrapping

    # Tags to extract
    content_tags = ["p", "h1", "h2", "h3", "code"]
    # Class-name fragments that mark page chrome rather than content
    boilerplate_markers = ("menu", "sidebar", "nav", "footer")

    def _is_boilerplate_class(css_class) -> bool:
        """True when a CSS class value contains one of the chrome markers."""
        return bool(css_class) and any(
            marker in css_class.lower() for marker in boilerplate_markers
        )

    visited       = set()
    page_sections = []   # Markdown pieces, joined once after the loop
    log           = ""

    for current_url in urls:
        if current_url in visited:
            log += f"⏭️  Already visited: {current_url}\n"
            yield log, "⏳ Scraping...", ""
            continue

        try:
            log += f"\n🔎 Scraping: {current_url}\n"
            yield log, "⏳ Scraping...", ""   # Emit BEFORE the request (live feedback)

            response = requests.get(current_url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")
            visited.add(current_url)   # Only successfully fetched URLs count as visited

            # Build focused HTML from the wanted tags only
            fragments = []
            for tag in soup.find_all(content_tags):
                # Skip anything that lives inside navigation / sidebar / footer
                if tag.find_parent(class_=_is_boilerplate_class):
                    continue
                # Skip nested matches such as <code> inside <p>: the outer
                # element's prettify() already includes them.
                if tag.find_parent(content_tags):
                    continue
                fragments.append(tag.prettify())
            isolated_html = "".join(fragments)

            # Convert extracted HTML fragment to Markdown
            if isolated_html.strip():
                markdown_text = h.handle(isolated_html)
                page_sections.append(f"\n\n--- Page: {current_url} ---\n\n")
                page_sections.append(markdown_text)
                log += f"   ✅ Content extracted ({len(markdown_text)} chars)\n"
            else:
                log += "   ⚠️  No extractable content found.\n"

        except Exception as e:
            # Best-effort crawl: record the failure and move on
            log += f"   ❌ Failed: {e}\n"

        yield log, "⏳ Scraping...", ""

    # ── Post-scraping: clean and save ────────────────────────────────
    log += "\n🧹 Cleaning extracted text...\n"
    yield log, "⏳ Cleaning...", ""

    cleaned_text = clean_scraped_text("".join(page_sections))

    # Keep a copy on disk. The copy is a convenience, so an unwritable
    # working directory must not discard the text that was just scraped.
    try:
        with open("scraped_data_gemini.txt", "w", encoding="utf-8") as f:
            f.write(cleaned_text)
        log += "💾 Saved to scraped_data_gemini.txt\n"
    except OSError as e:
        log += f"⚠️ Could not save scraped_data_gemini.txt: {e}\n"

    # Final yield — pass the full cleaned text back via the 3rd output slot
    yield log, "✅ Scraping complete!", cleaned_text


# =====================================================================
# SECTION 2: VECTOR STORE BUILDER
# =====================================================================
# Identical to webpage_query_groq_ui.py — embeddings are LLM-independent.


def build_vector_store(text: str, chunk_size: int = 1500, chunk_overlap: int = 200):
    """
    Convert a text string into a searchable FAISS vector store.

    Steps:
      1. Split text into overlapping chunks (RecursiveCharacterTextSplitter).
      2. Embed each chunk with the HuggingFace model
         "sentence-transformers/all-MiniLM-L6-v2".
      3. Build a FAISS index from the chunks and their embeddings.

    Args:
        text          (str): Full cleaned text to index.
        chunk_size    (int): Max characters per chunk (default: 1500).
        chunk_overlap (int): Characters shared between adjacent chunks (default: 200).

    Returns:
        vectorstore (FAISS): Vector store built from the chunks.
        num_chunks  (int):   Number of chunks created.

    Raises:
        ValueError: If splitting `text` produces no chunks (for example when
            `text` is empty or whitespace-only).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size    = chunk_size,
        chunk_overlap = chunk_overlap
    )
    chunks = splitter.split_text(text)

    # Fail early with a clear message: building a FAISS index from zero
    # texts otherwise fails deep inside the library with an opaque error,
    # and there is no point loading the embedding model for nothing.
    if not chunks:
        raise ValueError("No text to index: the input produced zero chunks.")

    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    vectorstore = FAISS.from_texts(chunks, embedding_model)
    return vectorstore, len(chunks)


# =====================================================================
# SECTION 3: PROMPT TEMPLATES
# =====================================================================
# Same prompts as webpage_query_groq_ui.py — prompts are LLM-agnostic.
# Gemini models handle all four chain types with the same templates.


# Stuff strategy prompt, used by _run_stuff_chain.
#   {context}  ← every retrieved chunk joined into one block
#   {question} ← the user's question
# The template instructs the model to reply "I don't know." when the
# context does not contain the answer.
CUSTOM_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful assistant. Use the following extracted content to answer the question.
Answer in a clear, factual, and concise way. If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""
)

# Map-Reduce MAP step prompt, used by _run_map_reduce_chain.
# Invoked once per retrieved chunk:
#   {context}  ← the text of that single chunk
#   {question} ← the user's question
MAP_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Use the following context to answer the question:

{context}

Question: {question}
Answer:
"""
)

# Map-Reduce COMBINE (reduce) step prompt, used by _run_map_reduce_chain.
#   {summaries} ← the per-chunk answers from the map step, separated by
#                 blank lines
#   {question}  ← the user's question
COMBINE_PROMPT = PromptTemplate(
    input_variables=["summaries", "question"],
    template="""
The following are answers extracted from different document sections:
{summaries}

Given the above, provide a final, concise answer to the question:

Question: {question}
Answer:
"""
)

# Refine INITIAL step prompt, used by _run_refine_chain to produce the
# starting answer.
#   {context}  ← the text of the first retrieved chunk
#   {question} ← the user's question
QUESTION_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are given a document and a question. Use the document to answer.

Document:
{context}

Question: {question}
Answer:
"""
)

# Refine REFINEMENT step prompt, used by _run_refine_chain once for each
# retrieved chunk after the first.
#   {existing_answer} ← the answer produced so far
#   {context}         ← the text of the next chunk
#   {question}        ← the user's question
REFINE_PROMPT = PromptTemplate(
    input_variables=["existing_answer", "context", "question"],
    template="""
We have an existing answer: {existing_answer}

Here is another document section that may help refine it:
{context}

Question: {question}

Update the answer if this document provides new useful information.
If not, keep the original answer unchanged.

Refined Answer:
"""
)

# Map-Rerank prompt, invoked once per retrieved chunk by the map-rerank
# chain code.
#   {context}  ← the text of that single chunk
#   {question} ← the user's question
# NOTE: the "Answer:" / "Score:" labels requested in the Format section are
# what the map-rerank code searches for with regular expressions when it
# parses the reply — keep the two in sync if the format is changed.
RERANK_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are given a document and a question.

Document:
{context}

Question: {question}

Provide:
1. An answer to the question (if the document is relevant).
2. A relevance score between 0 and 10 (higher means more relevant).

Format:
Answer: <your answer here>
Score: <number between 0 and 10>
"""
)


# =====================================================================
# SECTION 4: QA CHAIN BUILDER  (LCEL — Google Gemini edition)
# =====================================================================
# Same LCEL logic as webpage_query_groq_ui.py.
# The ONLY difference: LLM is ChatGoogleGenerativeAI instead of ChatGroq.


def _format_docs(docs: list) -> str:
    """Return the page_content of every document, separated by blank lines."""
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


def _run_stuff_chain(retriever, llm, question: str) -> tuple[str, list]:
    """
    STUFF strategy — retrieve the chunks for `question`, merge them into a
    single context string and ask the LLM once using CUSTOM_PROMPT.

    Args:
        retriever: Object whose `invoke(question)` returns the documents.
        llm:       Chat model placed between the prompt and the parser.
        question:  The user's question.

    Returns:
        (answer, docs): The parsed LLM reply and the retrieved documents.
    """
    retrieved = retriever.invoke(question)

    # prompt → model → plain-string output
    stuff_chain = CUSTOM_PROMPT | llm | StrOutputParser()

    prompt_inputs = {
        "context":  _format_docs(retrieved),
        "question": question,
    }
    return stuff_chain.invoke(prompt_inputs), retrieved


def _run_map_reduce_chain(retriever, llm, question: str) -> tuple[str, list]:
    """
    MAP-REDUCE strategy — each retrieved chunk is answered independently
    with MAP_PROMPT (map), then the partial answers are merged into one
    final answer with COMBINE_PROMPT (reduce).

    When nothing is retrieved, no LLM call is made and a fixed message is
    returned, matching the behaviour of the refine strategy.

    Args:
        retriever: Object whose `invoke(question)` returns the documents.
        llm:       Chat model placed between each prompt and the parser.
        question:  The user's question.

    Returns:
        (answer, docs): The combined answer and the retrieved documents.
    """
    docs = retriever.invoke(question)
    if not docs:
        # Without this guard the reduce step would be asked to summarise
        # an empty list of partial answers.
        return "No relevant documents found.", []

    # MAP: answer each chunk individually
    map_chain       = MAP_PROMPT | llm | StrOutputParser()
    partial_answers = [
        map_chain.invoke({"context": doc.page_content, "question": question})
        for doc in docs
    ]

    # REDUCE: combine all partial answers into one final answer
    combine_chain = COMBINE_PROMPT | llm | StrOutputParser()
    answer = combine_chain.invoke({
        "summaries": "\n\n".join(partial_answers),
        "question":  question
    })
    return answer, docs


def _run_refine_chain(retriever, llm, question: str) -> tuple[str, list]:
    """
    REFINE strategy — the first retrieved chunk yields a starting answer
    (QUESTION_PROMPT); every following chunk is then shown to the LLM
    together with the current answer (REFINE_PROMPT) and its reply becomes
    the new current answer.

    Args:
        retriever: Object whose `invoke(question)` returns the documents.
        llm:       Chat model placed between each prompt and the parser.
        question:  The user's question.

    Returns:
        (answer, docs): The final answer and the retrieved documents, or
        ("No relevant documents found.", []) when nothing was retrieved.
    """
    retrieved = retriever.invoke(question)
    if not retrieved:
        return "No relevant documents found.", []

    first_doc, later_docs = retrieved[0], retrieved[1:]

    # Seed the answer from the first chunk only
    seed_chain = QUESTION_PROMPT | llm | StrOutputParser()
    current_answer = seed_chain.invoke({
        "context":  first_doc.page_content,
        "question": question,
    })

    # Feed the remaining chunks one at a time, carrying the answer forward
    update_chain = REFINE_PROMPT | llm | StrOutputParser()
    for extra_doc in later_docs:
        current_answer = update_chain.invoke({
            "existing_answer": current_answer,
            "context":         extra_doc.page_content,
            "question":        question,
        })

    return current_answer, retrieved


def _parse_rerank_response(raw: str) -> tuple[str, float]:
    """
    Split one map-rerank LLM reply into (answer, score).

    The reply is expected to follow the "Answer: ... / Score: ..." layout
    requested by RERANK_PROMPT. Parsing is tolerant of common deviations:
    label case, markdown emphasis around the labels (e.g. "**Score:** 8")
    and decimal scores. A missing score counts as 0, scores above 10 are
    clamped to 10, and a missing "Answer:" label makes the whole reply the
    answer.
    """
    score_match = re.search(
        r"Score\s*:\**\s*(\d+(?:\.\d+)?)", raw, re.IGNORECASE
    )
    answer_match = re.search(
        r"Answer\s*:\**\s*(.+?)(?=\**\s*Score\s*:|$)",
        raw,
        re.DOTALL | re.IGNORECASE,
    )

    score = float(score_match.group(1)) if score_match else 0.0
    score = min(score, 10.0)   # the prompt asks for a 0–10 scale
    answer = answer_match.group(1).strip() if answer_match else raw.strip()
    return answer, score


def _run_map_rerank_chain(retriever, llm, question: str) -> tuple[str, list]:
    """
    MAP-RERANK strategy — the LLM is asked, once per retrieved chunk, for
    an answer plus a relevance score (RERANK_PROMPT). The answer with the
    highest score is returned; on a tie the earliest chunk wins.

    Args:
        retriever: Object whose `invoke(question)` returns the documents.
        llm:       Chat model placed between the prompt and the parser.
        question:  The user's question.

    Returns:
        (answer, docs): The best-scoring answer and the retrieved
        documents, or ("No relevant document found.", docs) when nothing
        was retrieved.
    """
    docs = retriever.invoke(question)
    if not docs:
        return "No relevant document found.", docs

    rerank_chain = RERANK_PROMPT | llm | StrOutputParser()
    best_answer  = "No relevant document found."
    best_score   = -1.0   # below any parsed score, so the first chunk always registers

    for doc in docs:
        raw = rerank_chain.invoke({
            "context":  doc.page_content,
            "question": question
        })
        answer, score = _parse_rerank_response(raw)

        # Strict ">" keeps the earliest chunk on ties
        if score > best_score:
            best_score  = score
            best_answer = answer

    return best_answer, docs


def run_qa(vectorstore, model_name: str, chain_type: str, question: str) -> tuple[str, list]:
    """
    Dispatcher: create the Gemini chat model, wrap the vector store in a
    top-5 retriever, and run the RAG strategy named by `chain_type`.

    Supported `chain_type` values:
      "stuff"       → _run_stuff_chain       (all chunks in one prompt)
      "map_reduce"  → _run_map_reduce_chain  (per-chunk answers, then combined)
      "refine"      → _run_refine_chain      (answer refined chunk by chunk)
      "map_rerank"  → _run_map_rerank_chain  (best-scored chunk's answer)

    Args:
        vectorstore (FAISS): Indexed vector store to search.
        model_name  (str):   Model identifier handed to ChatGoogleGenerativeAI.
        chain_type  (str):   One of the strategy names above.
        question    (str):   User's question.

    Returns:
        answer      (str):  The strategy's answer.
        source_docs (list): Retrieved Document objects used as context.

    Raises:
        ValueError: If `chain_type` is not one of the supported names.
    """
    # No API key is passed explicitly; per the file header the client is
    # expected to pick up GOOGLE_API_KEY from the environment.
    # temperature=0 asks for the least random sampling.
    # NOTE(review): the prompts in this file are plain PromptTemplates with
    # no system message, so convert_system_message_to_human may be a no-op
    # here — confirm against the installed langchain_google_genai version.
    gemini = ChatGoogleGenerativeAI(
        model=model_name,
        temperature=0,
        convert_system_message_to_human=True,
    )

    # Each query retrieves the 5 nearest chunks
    top_k_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

    strategies = {
        "stuff":      _run_stuff_chain,
        "map_reduce": _run_map_reduce_chain,
        "refine":     _run_refine_chain,
        "map_rerank": _run_map_rerank_chain,
    }

    runner = strategies.get(chain_type)
    if runner is None:
        raise ValueError(f"Unknown chain_type '{chain_type}'. Choose: {list(strategies)}")

    return runner(top_k_retriever, gemini, question)


# =====================================================================
# SECTION 5: GRADIO EVENT HANDLERS
# =====================================================================


def on_discover_links(url: str):
    """
    Event handler for the 'Discover Links' button.

    For a non-blank URL, runs discover_links on it and returns the found
    links both as the Dropdown's choices and as its selected value (so
    everything starts selected). For a blank URL, clears the Dropdown and
    returns a warning.

    Args:
        url (str): URL entered by the user.

    Returns:
        Tuple[gr.update, str]: Dropdown update + status message.
    """
    cleaned_url = url.strip()

    if cleaned_url:
        found_links, status = discover_links(cleaned_url)
        return gr.update(choices=found_links, value=found_links), status

    return gr.update(choices=[], value=[]), "⚠️ Please enter a valid URL first."


def on_scrape_and_index(url: str, selected_links: list, max_pages: int):
    """
    Streaming event handler for the 'Crawl & Index' button.

    Relays every progress update from scrape_urls_streaming, then builds
    the vector store from the scraped text. The vector store is only
    present in the last yield of a successful run; every other yield
    carries None and a chunk count of 0.

    Args:
        url            (str):  Base URL (placed first in the crawl list).
        selected_links (list): Links chosen in the multiselect dropdown.
        max_pages      (int):  Cap applied to the de-duplicated crawl list.

    Yields:
        Tuple[str, str, object, int]:
            (scrape_log, status_text, vectorstore_or_None, chunks_count)
    """
    base = url.strip()
    if not base:
        yield (
            "⚠️ No URL provided. Please enter a URL and discover links first.",
            "❌ No URL",
            None,
            0,
        )
        return

    # Base URL first, then the user's picks; dict.fromkeys drops
    # duplicates while keeping order, and the slice enforces the cap.
    candidates = [base] + (selected_links or [])
    all_urls = list(dict.fromkeys(candidates))[:max_pages]

    header = f"📋 Will crawl {len(all_urls)} URL(s):\n"
    header += "".join(f"   • {target}\n" for target in all_urls)
    header += "\n"

    yield header, "⏳ Starting scrape...", None, 0

    # ── Forward the scraper's progress, prefixed with the crawl plan ──
    cleaned_text = ""
    for progress_log, progress_status, final_text in scrape_urls_streaming(all_urls):
        combined_log = header + progress_log
        yield combined_log, progress_status, None, 0
        if final_text:
            # Only the scraper's last update carries the scraped text
            cleaned_text = final_text

    if not cleaned_text.strip():
        yield (
            combined_log + "\n❌ No content extracted. Check the URL and try again.",
            "❌ Nothing scraped",
            None,
            0,
        )
        return

    # ── Index the scraped text ────────────────────────────────────────
    combined_log += "\n🧠 Building vector store — embedding text chunks...\n"
    yield combined_log, "⏳ Building index...", None, 0

    try:
        store, chunk_count = build_vector_store(cleaned_text)
        combined_log += (
            f"✅ Vector store ready! {chunk_count} chunks indexed.\n\n"
            f"🎉 All done! Switch to the '💬 Ask Questions' tab to query the content."
        )
        yield combined_log, f"✅ Indexed {chunk_count} chunks — Ready!", store, chunk_count

    except Exception as e:
        combined_log += f"\n❌ Indexing error: {e}"
        yield combined_log, f"❌ Error: {e}", None, 0


def on_ask_question(question: str, model_name: str, chain_type: str, vectorstore):
    """
    Event handler for the 'Get Answer' button.

    Validates the inputs, runs the question through run_qa, and formats a
    preview (first 400 characters) of each returned source chunk. Any
    exception raised while answering or formatting is turned into an error
    message instead of propagating.

    Args:
        question    (str):   The user's question.
        model_name  (str):   Gemini model name.
        chain_type  (str):   RAG chain strategy.
        vectorstore (FAISS): Indexed vector store from Gradio State, or None.

    Returns:
        Tuple[str, str]: (answer_text, formatted_source_chunks)
    """
    # The index check comes first, so a missing index is reported even
    # when the question is blank as well.
    if vectorstore is None:
        return "⚠️ No content indexed yet. Please scrape and index a website first.", ""

    if not question.strip():
        return "⚠️ Please enter a question.", ""

    try:
        answer, source_docs = run_qa(vectorstore, model_name, chain_type, question.strip())

        if not source_docs:
            return answer, "No source documents returned."

        # One numbered section per retrieved chunk
        rendered_sources = "".join(
            f"─── Source {position} ───\n{doc.page_content[:400].strip()}\n\n"
            for position, doc in enumerate(source_docs, start=1)
        )
        return answer, rendered_sources

    except Exception as e:
        return f"❌ Error running QA chain: {e}", ""


# =====================================================================
# SECTION 6: GRADIO UI LAYOUT
# =====================================================================


def build_ui():
    """
    Construct and return the Gradio Blocks application.

    Identical layout to webpage_query_groq_ui.py with these differences:
      - Header colour: Google blue accent to distinguish from other UIs
      - Badge mentions Google / Gemini
      - Model dropdown lists Gemini model IDs
      - Port 7864

    Returns:
        gr.Blocks: The fully configured Gradio application.
    """

    # ── Professional CSS — White / Black / Google Blue (Gemini edition) ──
    PRO_CSS = """

    /* ── Base: clean white page ── */
    body, .gradio-container {
        background: #f8fafc !important;
        font-family: 'Inter', 'Segoe UI', Arial, sans-serif !important;
        color: #111827 !important;
    }

    /* ── Hero header — Google blue gradient ── */
    .app-header {
        background: linear-gradient(135deg, #1a73e8 0%, #1557b0 50%, #0d47a1 100%);
        border-radius: 10px;
        padding: 26px 32px;
        margin-bottom: 4px;
        box-shadow: 0 2px 12px rgba(26,115,232,0.22);
    }
    .app-header h1 {
        color: #ffffff !important;
        font-size: 1.75rem !important;
        font-weight: 700 !important;
        margin: 0 0 5px 0 !important;
        letter-spacing: -0.2px;
    }
    .app-header p {
        color: #bbdefb !important;
        font-size: 0.92rem !important;
        margin: 0 0 12px 0 !important;
    }
    .badge-row { display: flex; gap: 8px; flex-wrap: wrap; }
    .badge {
        background: rgba(255,255,255,0.12);
        border: 1px solid rgba(255,255,255,0.25);
        border-radius: 20px;
        padding: 3px 12px;
        font-size: 0.73rem;
        color: #e3f2fd;
        font-weight: 500;
    }

    /* ── Page background & card panels ── */
    .gradio-container .prose,
    .block, fieldset {
        background: #ffffff !important;
        border: 1px solid #e2e8f0 !important;
        border-radius: 8px !important;
        box-shadow: 0 1px 4px rgba(0,0,0,0.05) !important;
    }

    /* ── Tab bar ── */
    .tabs > .tab-nav {
        background: #ffffff !important;
        border-bottom: 2px solid #90caf9 !important;
        padding: 0 8px !important;
    }
    .tabs > .tab-nav button {
        color: #64748b !important;
        font-weight: 600 !important;
        font-size: 0.88rem !important;
        padding: 10px 22px !important;
        border: none !important;
        background: transparent !important;
        border-bottom: 2px solid transparent !important;
        margin-bottom: -2px !important;
        transition: color 0.15s !important;
    }
    .tabs > .tab-nav button.selected {
        color: #1557b0 !important;
        border-bottom: 2px solid #1a73e8 !important;
        background: transparent !important;
    }
    .tabs > .tab-nav button:hover:not(.selected) {
        color: #1a73e8 !important;
        background: #e8f0fe !important;
    }

    /* ── Section labels ── */
    .section-label {
        color: #0d47a1 !important;
        font-size: 0.75rem !important;
        font-weight: 700 !important;
        letter-spacing: 0.07em !important;
        text-transform: uppercase !important;
        border-left: 3px solid #1a73e8 !important;
        padding-left: 9px !important;
        margin: 16px 0 6px 0 !important;
        background: none !important;
        border-radius: 0 !important;
        box-shadow: none !important;
    }

    /* ── Labels & info text ── */
    label span {
        color: #374151 !important;
        font-size: 0.82rem !important;
        font-weight: 600 !important;
    }
    .info { color: #94a3b8 !important; font-size: 0.77rem !important; }

    /* ── Inputs, textareas, selects ── */
    input, textarea, select {
        background: #ffffff !important;
        color: #111827 !important;
        border: 1px solid #cbd5e1 !important;
        border-radius: 6px !important;
        font-size: 0.9rem !important;
        transition: border-color 0.15s, box-shadow 0.15s !important;
    }
    input:focus, textarea:focus {
        border-color: #1a73e8 !important;
        box-shadow: 0 0 0 3px rgba(26,115,232,0.12) !important;
        outline: none !important;
    }

    /* ── Slider ── */
    input[type=range] {
        accent-color: #1a73e8 !important;
        background: transparent !important;
        border: none !important;
        box-shadow: none !important;
    }

    /* ── Multiselect dropdown ── */
    .multiselect, .multiselect > div {
        background: #ffffff !important;
        border: 1.5px solid #90caf9 !important;
        border-radius: 8px !important;
        min-height: 52px !important;
    }
    .multiselect:focus-within {
        border-color: #1a73e8 !important;
        box-shadow: 0 0 0 3px rgba(26,115,232,0.12) !important;
    }
    /* Selected link chips — Google blue */
    .multiselect .token, .multiselect [data-token] {
        background: #1a73e8 !important;
        color: #ffffff !important;
        border-radius: 5px !important;
        padding: 3px 10px !important;
        font-size: 0.78rem !important;
        font-weight: 500 !important;
        display: inline-flex !important;
        align-items: center !important;
        gap: 5px !important;
        margin: 2px !important;
    }
    .multiselect .token button, .multiselect [data-token] button {
        background: transparent !important;
        color: #bbdefb !important;
        border: none !important;
        font-size: 0.9rem !important;
        cursor: pointer !important;
        padding: 0 2px !important;
    }
    .multiselect .token button:hover { color: #ffffff !important; }
    /* Dropdown option list */
    .multiselect .dropdown, .multiselect ul {
        background: #ffffff !important;
        border: 1px solid #90caf9 !important;
        border-radius: 8px !important;
        box-shadow: 0 4px 16px rgba(0,0,0,0.10) !important;
        max-height: 260px !important;
        overflow-y: auto !important;
    }
    .multiselect li, .multiselect .option {
        color: #0d47a1 !important;
        font-size: 0.83rem !important;
        padding: 8px 14px !important;
        cursor: pointer !important;
        border-bottom: 1px solid #f1f5f9 !important;
    }
    .multiselect li:hover, .multiselect .option:hover {
        background: #e8f0fe !important;
        color: #1557b0 !important;
    }
    .multiselect li.selected, .multiselect .option.selected {
        background: #bbdefb !important;
        color: #0d47a1 !important;
        font-weight: 600 !important;
    }

    /* ── Primary button ── */
    button.primary {
        background: #1a73e8 !important;
        color: #ffffff !important;
        border: none !important;
        border-radius: 7px !important;
        font-weight: 700 !important;
        font-size: 0.9rem !important;
        padding: 10px 24px !important;
        box-shadow: 0 1px 6px rgba(26,115,232,0.30) !important;
        transition: background 0.15s, box-shadow 0.15s, transform 0.1s !important;
    }
    button.primary:hover {
        background: #1557b0 !important;
        box-shadow: 0 3px 12px rgba(21,87,176,0.35) !important;
        transform: translateY(-1px) !important;
    }
    button.primary:active { transform: translateY(0) !important; }

    /* ── Secondary button ── */
    button.secondary {
        background: #ffffff !important;
        color: #1a73e8 !important;
        border: 1.5px solid #1a73e8 !important;
        border-radius: 7px !important;
        font-weight: 600 !important;
        font-size: 0.9rem !important;
        transition: background 0.15s, box-shadow 0.15s !important;
    }
    button.secondary:hover {
        background: #e8f0fe !important;
        border-color: #1557b0 !important;
        box-shadow: 0 1px 6px rgba(26,115,232,0.15) !important;
    }

    /* ── Live log box (dark terminal, blue text) ── */
    .log-box textarea {
        font-family: 'Consolas', 'Courier New', monospace !important;
        font-size: 0.82rem !important;
        background: #0a1628 !important;
        color: #90caf9 !important;
        border: 1px solid #90caf9 !important;
        border-radius: 6px !important;
        line-height: 1.65 !important;
    }

    /* ── Answer box ── */
    .answer-box textarea {
        background: #e8f0fe !important;
        color: #111827 !important;
        font-size: 0.93rem !important;
        line-height: 1.75 !important;
        border: 1px solid #90caf9 !important;
        border-radius: 6px !important;
    }

    /* ── Status box ── */
    .status-box textarea, .status-box input {
        background: #f0fdf4 !important;
        color: #166534 !important;
        border: 1px solid #bbf7d0 !important;
        font-weight: 600 !important;
        font-size: 0.85rem !important;
    }

    /* ── Chunks badge ── */
    .chunks-badge input {
        background: #e8f0fe !important;
        color: #1557b0 !important;
        border: 1px solid #90caf9 !important;
        font-weight: 700 !important;
        font-size: 1.1rem !important;
        text-align: center !important;
    }

    /* ── Accordion ── */
    .accordion {
        background: #ffffff !important;
        border: 1px solid #e2e8f0 !important;
        border-radius: 8px !important;
    }
    .accordion .label-wrap span {
        color: #1a73e8 !important;
        font-weight: 600 !important;
    }
    """

    with gr.Blocks(title="Kamal Gemini Web Crawler & Q&A", css=PRO_CSS) as demo:

        # ── State components (invisible, persist across button clicks) ─
        vectorstore_state = gr.State(None)   # Holds the FAISS index object
        num_chunks_state  = gr.State(0)      # Number of indexed chunks

        # ── Hero Header ───────────────────────────────────────────────
        gr.HTML("""
        <div class="app-header">
            <h1>💎 Kamal AI Web Crawler &amp; Question Answering</h1>
            <p>Scrape any website &rarr; Index its content &rarr; Ask questions using Google Gemini</p>
            <div class="badge-row">
                <span class="badge">🔵 Google Gemini</span>
                <span class="badge">🔗 LangChain RAG</span>
                <span class="badge">🗄️ FAISS Vector Search</span>
                <span class="badge">🤗 HuggingFace Embeddings</span>
            </div>
        </div>
        """)

        # ─────────────────────────────────────────────────────────────
        # TAB 1: SCRAPE & INDEX
        # ─────────────────────────────────────────────────────────────
        with gr.Tab("🌐 Scrape & Index"):