Skip to content

Commit b578eee

Browse files
committed
feat: add Notion metadata extractor
Extract page and database metadata from Notion workspaces via the Notion API. Emits document entities for both pages and databases with child_of, belongs_to, owned_by, and documented_by edges. Reads page block content to scan for URN references linking documentation to data assets. Closes #503 (Notion portion)
1 parent b560a76 commit b578eee

6 files changed

Lines changed: 1344 additions & 0 deletions

File tree

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Notion
2+
3+
Extract page and database metadata from a Notion workspace using the Notion API.
4+
5+
## Usage
6+
7+
```yaml
8+
source:
9+
name: notion
10+
scope: my-workspace
11+
config:
12+
token: ntn_your_integration_token
13+
extract:
14+
- pages
15+
- databases
16+
```
17+
18+
## Configuration
19+
20+
| Key | Type | Required | Description |
21+
| :-- | :--- | :------- | :---------- |
22+
| `token` | `string` | Yes | Notion internal integration token. |
23+
| `base_url` | `string` | No | Override Notion API base URL. Defaults to `https://api.notion.com`. |
24+
| `extract` | `[]string` | No | Entity types to extract. Defaults to all: `pages`, `databases`. |
25+
26+
## Entities
27+
28+
The extractor emits `document` entities for both pages and databases.
29+
30+
### Entity: `document` (page)
31+
32+
| Field | Sample Value |
33+
| :---- | :----------- |
34+
| `urn` | `urn:notion:my-workspace:document:abc123-def456` |
35+
| `name` | `Data Pipeline Architecture` |
36+
| `properties.page_id` | `abc123-def456` |
37+
| `properties.created_at` | `2024-01-15T10:30:00Z` |
38+
| `properties.updated_at` | `2024-03-20T14:15:00Z` |
39+
| `properties.created_by` | `Alice` |
40+
| `properties.last_edited_by` | `Bob` |
41+
| `properties.web_url` | `https://www.notion.so/Data-Pipeline-abc123` |
42+
| `properties.archived` | `false` |
43+
44+
### Entity: `document` (database)
45+
46+
| Field | Sample Value |
47+
| :---- | :----------- |
48+
| `urn` | `urn:notion:my-workspace:document:db-789` |
49+
| `name` | `Project Tracker` |
50+
| `description` | `Track all engineering projects` |
51+
| `properties.database_id` | `db-789` |
52+
| `properties.created_at` | `2024-01-10T09:00:00Z` |
53+
| `properties.updated_at` | `2024-03-18T16:00:00Z` |
54+
| `properties.created_by` | `Alice` |
55+
| `properties.columns` | `["Name", "Status", "Priority"]` |
56+
| `properties.web_url` | `https://www.notion.so/db-789` |
57+
58+
### Edges
59+
60+
| Type | Source | Target | Description |
61+
| :--- | :----- | :----- | :---------- |
62+
| `child_of` | `document` | `document` | Page is a child of another page |
63+
| `belongs_to` | `document` | `document` | Page belongs to a database |
64+
| `owned_by` | `document` | `user` | Page/database is owned by its creator |
65+
| `documented_by` | `document` | any | Page references a data asset via URN in its content |
66+
67+
### URN Reference Detection
68+
69+
The extractor reads page block content and scans for URN patterns (`urn:service:scope:type:id`), emitting `documented_by` edges to link documentation to data assets.
70+
71+
## Contributing
72+
73+
Refer to the [contribution guidelines](../../../docs/docs/contribute/guide.md#adding-a-new-extractor) for information on contributing to this module.
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
package notion
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"encoding/json"
7+
"fmt"
8+
"io"
9+
"net/http"
10+
"time"
11+
)
12+
13+
const (
14+
defaultBaseURL = "https://api.notion.com"
15+
notionAPIVersion = "2022-06-28"
16+
defaultPageSize = 100
17+
)
18+
19+
// Page represents a Notion page.
20+
type Page struct {
21+
ID string `json:"id"`
22+
CreatedTime time.Time `json:"created_time"`
23+
LastEditedTime time.Time `json:"last_edited_time"`
24+
CreatedBy User `json:"created_by"`
25+
LastEditedBy User `json:"last_edited_by"`
26+
Archived bool `json:"archived"`
27+
URL string `json:"url"`
28+
Parent Parent `json:"parent"`
29+
Properties map[string]any `json:"properties"`
30+
}
31+
32+
// Database represents a Notion database.
33+
type Database struct {
34+
ID string `json:"id"`
35+
CreatedTime time.Time `json:"created_time"`
36+
LastEditedTime time.Time `json:"last_edited_time"`
37+
CreatedBy User `json:"created_by"`
38+
LastEditedBy User `json:"last_edited_by"`
39+
Title []RichText `json:"title"`
40+
Description []RichText `json:"description"`
41+
Archived bool `json:"archived"`
42+
URL string `json:"url"`
43+
Parent Parent `json:"parent"`
44+
Properties map[string]any `json:"properties"`
45+
}
46+
47+
// User represents a Notion user.
48+
type User struct {
49+
ID string `json:"id"`
50+
Name string `json:"name"`
51+
}
52+
53+
// Parent represents the parent of a page or database.
54+
type Parent struct {
55+
Type string `json:"type"`
56+
PageID string `json:"page_id,omitempty"`
57+
DatabaseID string `json:"database_id,omitempty"`
58+
WorkspaceID string `json:"workspace,omitempty"`
59+
}
60+
61+
// RichText represents a Notion rich text object.
62+
type RichText struct {
63+
PlainText string `json:"plain_text"`
64+
}
65+
66+
// searchResponse is the response from the Notion search API.
67+
type searchResponse struct {
68+
Results []json.RawMessage `json:"results"`
69+
HasMore bool `json:"has_more"`
70+
NextCursor string `json:"next_cursor"`
71+
}
72+
73+
// blockChildrenResponse is the response from the block children API.
74+
type blockChildrenResponse struct {
75+
Results []Block `json:"results"`
76+
HasMore bool `json:"has_more"`
77+
NextCursor string `json:"next_cursor"`
78+
}
79+
80+
// Block represents a Notion block (used for reading page content).
81+
type Block struct {
82+
ID string `json:"id"`
83+
Type string `json:"type"`
84+
// We flatten all block types into a generic map for URN scanning.
85+
Paragraph *blockContent `json:"paragraph,omitempty"`
86+
Heading1 *blockContent `json:"heading_1,omitempty"`
87+
Heading2 *blockContent `json:"heading_2,omitempty"`
88+
Heading3 *blockContent `json:"heading_3,omitempty"`
89+
BulletedList *blockContent `json:"bulleted_list_item,omitempty"`
90+
NumberedList *blockContent `json:"numbered_list_item,omitempty"`
91+
Quote *blockContent `json:"quote,omitempty"`
92+
Callout *blockContent `json:"callout,omitempty"`
93+
Code *blockContent `json:"code,omitempty"`
94+
}
95+
96+
type blockContent struct {
97+
RichText []RichText `json:"rich_text"`
98+
}
99+
100+
// PlainText extracts all plain text from a block.
101+
func (b *Block) PlainText() string {
102+
for _, content := range []*blockContent{
103+
b.Paragraph, b.Heading1, b.Heading2, b.Heading3,
104+
b.BulletedList, b.NumberedList, b.Quote, b.Callout, b.Code,
105+
} {
106+
if content == nil {
107+
continue
108+
}
109+
var text string
110+
for _, rt := range content.RichText {
111+
text += rt.PlainText
112+
}
113+
if text != "" {
114+
return text
115+
}
116+
}
117+
return ""
118+
}
119+
120+
// Client wraps the Notion API.
121+
type Client struct {
122+
baseURL string
123+
httpClient *http.Client
124+
token string
125+
}
126+
127+
// NewClient creates a new Notion API client.
128+
func NewClient(token string) *Client {
129+
return &Client{
130+
baseURL: defaultBaseURL,
131+
httpClient: &http.Client{Timeout: 30 * time.Second},
132+
token: token,
133+
}
134+
}
135+
136+
// SetBaseURL overrides the API base URL (used for testing).
137+
func (c *Client) SetBaseURL(url string) {
138+
c.baseURL = url
139+
}
140+
141+
// SearchPages returns all pages, optionally filtered by query.
142+
func (c *Client) SearchPages(ctx context.Context) ([]Page, error) {
143+
var all []Page
144+
cursor := ""
145+
for {
146+
body := map[string]any{
147+
"filter": map[string]any{"value": "page", "property": "object"},
148+
"page_size": defaultPageSize,
149+
}
150+
if cursor != "" {
151+
body["start_cursor"] = cursor
152+
}
153+
154+
var resp searchResponse
155+
if err := c.post(ctx, "/v1/search", body, &resp); err != nil {
156+
return nil, fmt.Errorf("search pages: %w", err)
157+
}
158+
159+
for _, raw := range resp.Results {
160+
var page Page
161+
if err := json.Unmarshal(raw, &page); err != nil {
162+
return nil, fmt.Errorf("unmarshal page: %w", err)
163+
}
164+
all = append(all, page)
165+
}
166+
167+
if !resp.HasMore || resp.NextCursor == "" {
168+
break
169+
}
170+
cursor = resp.NextCursor
171+
}
172+
return all, nil
173+
}
174+
175+
// SearchDatabases returns all databases.
176+
func (c *Client) SearchDatabases(ctx context.Context) ([]Database, error) {
177+
var all []Database
178+
cursor := ""
179+
for {
180+
body := map[string]any{
181+
"filter": map[string]any{"value": "database", "property": "object"},
182+
"page_size": defaultPageSize,
183+
}
184+
if cursor != "" {
185+
body["start_cursor"] = cursor
186+
}
187+
188+
var resp searchResponse
189+
if err := c.post(ctx, "/v1/search", body, &resp); err != nil {
190+
return nil, fmt.Errorf("search databases: %w", err)
191+
}
192+
193+
for _, raw := range resp.Results {
194+
var db Database
195+
if err := json.Unmarshal(raw, &db); err != nil {
196+
return nil, fmt.Errorf("unmarshal database: %w", err)
197+
}
198+
all = append(all, db)
199+
}
200+
201+
if !resp.HasMore || resp.NextCursor == "" {
202+
break
203+
}
204+
cursor = resp.NextCursor
205+
}
206+
return all, nil
207+
}
208+
209+
// GetBlockChildren returns the top-level blocks of a page or block.
210+
func (c *Client) GetBlockChildren(ctx context.Context, blockID string) ([]Block, error) {
211+
var all []Block
212+
cursor := ""
213+
for {
214+
path := fmt.Sprintf("/v1/blocks/%s/children?page_size=%d", blockID, defaultPageSize)
215+
if cursor != "" {
216+
path += "&start_cursor=" + cursor
217+
}
218+
219+
var resp blockChildrenResponse
220+
if err := c.get(ctx, path, &resp); err != nil {
221+
return nil, fmt.Errorf("get block children for %s: %w", blockID, err)
222+
}
223+
all = append(all, resp.Results...)
224+
225+
if !resp.HasMore || resp.NextCursor == "" {
226+
break
227+
}
228+
cursor = resp.NextCursor
229+
}
230+
return all, nil
231+
}
232+
233+
func (c *Client) post(ctx context.Context, path string, body any, out any) error {
234+
jsonBody, err := json.Marshal(body)
235+
if err != nil {
236+
return fmt.Errorf("marshal request: %w", err)
237+
}
238+
239+
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(jsonBody))
240+
if err != nil {
241+
return fmt.Errorf("create request: %w", err)
242+
}
243+
c.setHeaders(req)
244+
req.Header.Set("Content-Type", "application/json")
245+
246+
return c.do(req, out)
247+
}
248+
249+
func (c *Client) get(ctx context.Context, path string, out any) error {
250+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
251+
if err != nil {
252+
return fmt.Errorf("create request: %w", err)
253+
}
254+
c.setHeaders(req)
255+
256+
return c.do(req, out)
257+
}
258+
259+
func (c *Client) setHeaders(req *http.Request) {
260+
req.Header.Set("Authorization", "Bearer "+c.token)
261+
req.Header.Set("Notion-Version", notionAPIVersion)
262+
req.Header.Set("Accept", "application/json")
263+
}
264+
265+
func (c *Client) do(req *http.Request, out any) error {
266+
resp, err := c.httpClient.Do(req)
267+
if err != nil {
268+
return fmt.Errorf("execute request: %w", err)
269+
}
270+
defer resp.Body.Close()
271+
272+
if resp.StatusCode != http.StatusOK {
273+
body, _ := io.ReadAll(resp.Body)
274+
return fmt.Errorf("unexpected status %d: %s", resp.StatusCode, truncate(string(body), 200))
275+
}
276+
277+
if err := json.NewDecoder(resp.Body).Decode(out); err != nil {
278+
return fmt.Errorf("decode response: %w", err)
279+
}
280+
return nil
281+
}
282+
283+
func truncate(s string, n int) string {
284+
if len(s) <= n {
285+
return s
286+
}
287+
return s[:n] + "..."
288+
}

0 commit comments

Comments
 (0)