dify-plugin-mem0/fetch_mem0_docs_browser.py at main · Feversun/dify-plugin-mem0 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env python3
"""
Fetch Mem0 API documentation using browser automation
to handle client-side rendered content
"""

import os
import time
import json
from datetime import datetime

# Try to use playwright, fallback to selenium if not available
try:
    from playwright.sync_api import sync_playwright
    USE_PLAYWRIGHT = True
except ImportError:
    USE_PLAYWRIGHT = False
    try:
        from selenium import webdriver
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.chrome.options import Options
    except ImportError:
        print("Please install either playwright or selenium:")
        print("pip install playwright && playwright install")
        print("OR")
        print("pip install selenium")
        exit(1)

# Mem0 API reference pages to fetch, kept as (section, page slug) pairs.
# Every page lives at <root>/<section>/<slug> and is saved as "<slug>.md".
_DOCS_API_ROOT = "https://docs.mem0.ai/api-reference"
_API_PAGES = (
    ("memory", "v2-get-memories"),
    ("memory", "v2-search-memories"),
    ("memory", "add-memories"),
    ("memory", "update-memory"),
    ("memory", "delete-memory"),
    ("memory", "delete-memories"),
    ("memory", "create-memory-export"),
    ("memory", "get-memory-export"),
    ("entities", "get-users"),
    ("entities", "delete-user"),
    ("organization", "create-org"),
    ("organization", "get-orgs"),
    ("organization", "get-org"),
    ("organization", "get-org-members"),
    ("project", "get-projects"),
    ("webhook", "get-webhook"),
)

# List of (page URL, output markdown filename) tuples consumed by main().
URLS_TO_FETCH = [
    (f"{_DOCS_API_ROOT}/{section}/{slug}", f"{slug}.md")
    for section, slug in _API_PAGES
]

# Output directory, relative to the current working directory. It is created
# as soon as the module is loaded so later writes can assume it exists.
OUTPUT_DIR = "mem0-api-docs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def fetch_with_playwright(url):
    """Fetch the rendered text of a page using headless Chromium via Playwright.

    The page is loaded, given time to render its client-side content, and
    then searched for a container whose visible text is longer than 100
    characters. Containers are tried from most specific to least specific.

    Args:
        url: Address of the page to load.

    Returns:
        The visible text of the first qualifying container. If no container
        qualifies, the full page HTML is returned instead. If loading or
        extraction fails, the string ``"Error: <message>"`` is returned
        rather than raising, so the caller always receives a string.
    """
    # Candidate content containers, most specific first; 'body' is the
    # catch-all at the end.
    selectors = [
        '[data-testid="api-content"]',
        '.api-reference-content',
        'article',
        'main',
        '.documentation-content',
        '.api-content',
        '[role="main"]',
        '.content',
        '#content',
        'body'
    ]

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        print(f"Loading {url}...")
        try:
            # Wait only for DOMContentLoaded (with a generous 60 s timeout)
            # instead of network idle.
            page.goto(url, wait_until="domcontentloaded", timeout=60000)

            try:
                # Give the API content up to 10 s to appear.
                page.wait_for_selector('[data-testid="api-content"], .api-reference-content, main', timeout=10000)
            except Exception:
                # None of the expected containers showed up; fall back to a
                # fixed 5 s pause. `Exception` (not a bare except) so that
                # Ctrl-C / SystemExit are not swallowed here.
                page.wait_for_timeout(5000)

            content = ""

            for selector in selectors:
                try:
                    element = page.query_selector(selector)
                    if element:
                        text_content = element.inner_text()
                        # Require more than 100 characters so an empty shell
                        # element is not mistaken for the real content.
                        if text_content and len(text_content) > 100:
                            content = text_content
                            break
                except Exception:
                    # A selector that fails is skipped; the next one is tried.
                    continue

            # Nothing matched: fall back to the full page HTML.
            if not content:
                content = page.content()

        except Exception as e:
            # Best effort: report the failure and hand the message back as
            # the content instead of propagating the exception.
            print(f"Error loading page: {str(e)}")
            content = f"Error: {str(e)}"
        finally:
            browser.close()

        return content

def fetch_with_selenium(url):
    """Fetch the rendered text of a page using headless Chrome via Selenium.

    Args:
        url: Address of the page to load.

    Returns:
        The text of the first candidate container that has any text, or the
        full page source when none of them does.

    Raises:
        Exceptions raised while loading the page or waiting for ``<body>``
        are not caught here and propagate to the caller. The driver is
        always shut down first.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=options)

    try:
        print(f"Loading {url}...")
        driver.get(url)

        # Wait up to 10 s for the <body> element to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Fixed extra pause so client-side rendering can finish.
        time.sleep(3)

        content = ""

        # Candidate content containers, most specific first.
        selectors = ['main', '.documentation-content', '.api-content', '[role="main"]', 'body']

        for selector in selectors:
            try:
                # Read .text once: each access is a separate WebDriver call.
                text = driver.find_element(By.CSS_SELECTOR, selector).text
            except Exception:
                # Selector missing or element unreadable: try the next one.
                # `Exception` (not a bare except) so that Ctrl-C / SystemExit
                # are not swallowed here.
                continue
            if text:
                content = text
                break

        # Nothing matched: fall back to the full page source.
        if not content:
            content = driver.page_source

        return content

    finally:
        driver.quit()

def extract_api_info(content, url):
    """Wrap fetched page content in a metadata record.

    Args:
        content: Page text (or HTML) returned by one of the fetchers.
        url: Address the content was loaded from.

    Returns:
        A dict with four keys: ``source`` (the URL exactly as given),
        ``title`` (the last path segment of the URL with dashes replaced by
        spaces, title-cased), ``date_created`` (the current local time
        formatted as ``YYYY-MM-DD HH:MM:SS``) and ``content`` (unchanged).
    """
    # Strip any fragment, query string and trailing slash before taking the
    # last segment; otherwise "…/add-memories/" yields an empty title and
    # "…/get-users?page=2" leaks the query into it.
    page_path = url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    slug = page_path.split('/')[-1]

    doc = {
        "source": url,
        "title": slug.replace('-', ' ').title(),
        "date_created": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        "content": content
    }

    return doc

def save_documentation(doc, filename):
    """Write one documentation record to a markdown file in OUTPUT_DIR.

    The file starts with a front-matter section (source, title,
    date_created) fenced by ``---`` lines, followed by a level-1 heading
    with the title and then the content.

    Args:
        doc: Mapping with the keys ``source``, ``title``, ``date_created``
            and ``content``.
        filename: Name of the file to create inside OUTPUT_DIR.
    """
    target = os.path.join(OUTPUT_DIR, filename)

    # One entry per output line; the trailing empty entry produces the
    # final newline once everything is joined.
    lines = [
        "---",
        f"source: {doc['source']}",
        f"title: {doc['title']}",
        f"date_created: {doc['date_created']}",
        "---",
        "",
        f"# {doc['title']}",
        "",
        f"{doc['content']}",
        "",
    ]
    rendered = "\n".join(lines)

    with open(target, 'w', encoding='utf-8') as out:
        out.write(rendered)

    print(f"Saved: {target}")

def main():
    """Fetch every page listed in URLS_TO_FETCH and save each as markdown.

    A failure on one page is printed and does not stop the remaining pages
    from being processed.
    """
    print(f"Using {'Playwright' if USE_PLAYWRIGHT else 'Selenium'} for browser automation")
    print(f"Fetching {len(URLS_TO_FETCH)} API documentation pages...\n")

    # The backend is decided when the module is imported, so pick the
    # matching fetcher once instead of branching on every page.
    fetch_page = fetch_with_playwright if USE_PLAYWRIGHT else fetch_with_selenium

    for page_url, md_name in URLS_TO_FETCH:
        try:
            raw = fetch_page(page_url)
            save_documentation(extract_api_info(raw, page_url), md_name)

            # Pause between pages to go easy on the docs server.
            time.sleep(1)
        except Exception as err:
            print(f"Error fetching {page_url}: {str(err)}")

    print(f"\nDocumentation fetching complete! Files saved in: {os.path.abspath(OUTPUT_DIR)}")


# Run the fetch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()