Skip to content

Commit fca4a24

Browse files
committed
feat: add Confluence metadata extractor
Extract page metadata and relationships from Confluence spaces via the REST API v2. Emits space and document entities with belongs_to, child_of, owned_by, and documented_by edges. Scans page content for URN references to auto-link documentation to data assets.
1 parent 84a63a2 commit fca4a24

6 files changed

Lines changed: 1192 additions & 0 deletions

File tree

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Confluence
2+
3+
Extract page metadata and relationships from Confluence spaces using the Confluence REST API v2.
4+
5+
## Usage
6+
7+
```yaml
source:
  name: confluence
  scope: my-confluence
  config:
    base_url: https://mycompany.atlassian.net/wiki
    username: user@company.com
    token: your-api-token
    spaces:
      - ENG
      - DATA
    exclude:
      - ARCHIVE
```
21+
22+
## Configuration
23+
24+
| Key | Type | Required | Description |
| :-- | :--- | :------- | :---------- |
| `base_url` | `string` | Yes | Confluence base URL (e.g. `https://mycompany.atlassian.net/wiki`). |
| `username` | `string` | Yes | Atlassian account email for API authentication. |
| `token` | `string` | Yes | Atlassian API token. |
| `spaces` | `[]string` | No | Space keys to extract. Defaults to all spaces. |
| `exclude` | `[]string` | No | Space keys to exclude from extraction. |
31+
32+
## Entities
33+
34+
The extractor emits two entity types and their relationships as edges.
35+
36+
### Entity: `space`
37+
38+
| Field | Sample Value |
39+
| :---- | :----------- |
40+
| `urn` | `urn:confluence:my-confluence:space:ENG` |
41+
| `name` | `Engineering` |
42+
| `description` | `Engineering team documentation` |
43+
| `properties.space_key` | `ENG` |
44+
| `properties.space_type` | `global` |
45+
| `properties.status` | `current` |
46+
| `properties.web_url` | `https://mycompany.atlassian.net/wiki/spaces/ENG` |
47+
48+
### Entity: `document`
49+
50+
| Field | Sample Value |
51+
| :---- | :----------- |
52+
| `urn` | `urn:confluence:my-confluence:document:12345` |
53+
| `name` | `Data Pipeline Architecture` |
54+
| `properties.page_id` | `12345` |
55+
| `properties.space_key` | `ENG` |
56+
| `properties.status` | `current` |
57+
| `properties.version` | `5` |
58+
| `properties.labels` | `["architecture", "data"]` |
59+
| `properties.created_at` | `2024-01-15T10:30:00Z` |
60+
| `properties.updated_at` | `2024-03-20T14:15:00Z` |
61+
| `properties.web_url` | `https://mycompany.atlassian.net/wiki/spaces/ENG/pages/12345` |
62+
63+
### Edges
64+
65+
| Type | Source | Target | Description |
66+
| :--- | :----- | :----- | :---------- |
67+
| `belongs_to` | `document` | `space` | Page belongs to a space |
68+
| `child_of` | `document` | `document` | Page is a child of another page |
69+
| `owned_by` | `document` | `user` | Page is owned by its author |
70+
| `documented_by` | `document` | any | Page references a data asset via URN in its content |
71+
72+
### URN Reference Detection
73+
74+
The extractor scans page content for URN patterns (`urn:service:scope:type:id`) and emits `documented_by` edges linking the page to referenced data assets. This enables connecting business documentation to technical metadata.
75+
76+
## Contributing
77+
78+
Refer to the [contribution guidelines](../../../docs/docs/contribute/guide.md#adding-a-new-extractor) for information on contributing to this module.
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
package confluence
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"io"
8+
"net/http"
9+
"net/url"
10+
"strings"
11+
"time"
12+
)
13+
14+
// Page represents a Confluence page from the v2 API.
15+
type Page struct {
16+
ID string `json:"id"`
17+
Title string `json:"title"`
18+
Status string `json:"status"`
19+
SpaceID string `json:"spaceId"`
20+
ParentID string `json:"parentId"`
21+
AuthorID string `json:"authorId"`
22+
CreatedAt time.Time `json:"createdAt"`
23+
Version struct {
24+
Number int `json:"number"`
25+
AuthorID string `json:"authorId"`
26+
CreatedAt time.Time `json:"createdAt"`
27+
} `json:"version"`
28+
Body struct {
29+
Storage struct {
30+
Value string `json:"value"`
31+
} `json:"storage"`
32+
} `json:"body"`
33+
Labels struct {
34+
Results []Label `json:"results"`
35+
} `json:"labels"`
36+
Links struct {
37+
WebUI string `json:"webui"`
38+
} `json:"_links"`
39+
}
40+
41+
// Space mirrors one space object in a Confluence API response.
// Each field is filled from the JSON key named in its struct tag.
type Space struct {
	ID     string `json:"id"`
	Key    string `json:"key"`
	Name   string `json:"name"`
	Type   string `json:"type"`
	Status string `json:"status"`

	// Description is the nested "description" object; only its
	// "plain.value" string is decoded.
	Description struct {
		Plain struct {
			Value string `json:"value"`
		} `json:"plain"`
	} `json:"description"`

	// Links is the "_links" object; WebUI is its "webui" entry.
	Links struct {
		WebUI string `json:"webui"`
	} `json:"_links"`
}
57+
58+
// Label is a single Confluence label, decoded from the "id" and "name"
// keys of a label object.
type Label struct {
	ID   string `json:"id"`
	Name string `json:"name"`
}
63+
64+
type pageResponse struct {
65+
Results []Page `json:"results"`
66+
Links struct {
67+
Next string `json:"next"`
68+
} `json:"_links"`
69+
}
70+
71+
type spaceResponse struct {
72+
Results []Space `json:"results"`
73+
Links struct {
74+
Next string `json:"next"`
75+
} `json:"_links"`
76+
}
77+
78+
// Client talks to the Confluence REST API v2.
// It holds the API base URL, the username and token used for basic auth on
// each request, and the *http.Client that executes the requests.
type Client struct {
	baseURL    string
	httpClient *http.Client
	username   string
	token      string
}
85+
86+
// NewClient creates a new Confluence API client.
87+
func NewClient(baseURL, username, token string) *Client {
88+
return &Client{
89+
baseURL: strings.TrimRight(baseURL, "/"),
90+
httpClient: &http.Client{Timeout: 30 * time.Second},
91+
username: username,
92+
token: token,
93+
}
94+
}
95+
96+
// GetSpaces returns all spaces, optionally filtered by keys.
97+
func (c *Client) GetSpaces(ctx context.Context, keys []string) ([]Space, error) {
98+
var all []Space
99+
cursor := ""
100+
for {
101+
params := url.Values{}
102+
params.Set("limit", "25")
103+
if len(keys) > 0 {
104+
params.Set("keys", strings.Join(keys, ","))
105+
}
106+
if cursor != "" {
107+
params.Set("cursor", cursor)
108+
}
109+
110+
var resp spaceResponse
111+
if err := c.get(ctx, "/api/v2/spaces", params, &resp); err != nil {
112+
return nil, fmt.Errorf("get spaces: %w", err)
113+
}
114+
all = append(all, resp.Results...)
115+
116+
cursor = parseCursor(resp.Links.Next)
117+
if cursor == "" {
118+
break
119+
}
120+
}
121+
return all, nil
122+
}
123+
124+
// GetPages returns all pages in a space.
125+
func (c *Client) GetPages(ctx context.Context, spaceID string) ([]Page, error) {
126+
var all []Page
127+
cursor := ""
128+
for {
129+
params := url.Values{}
130+
params.Set("space-id", spaceID)
131+
params.Set("limit", "25")
132+
params.Set("body-format", "storage")
133+
if cursor != "" {
134+
params.Set("cursor", cursor)
135+
}
136+
137+
var resp pageResponse
138+
if err := c.get(ctx, "/api/v2/pages", params, &resp); err != nil {
139+
return nil, fmt.Errorf("get pages for space %s: %w", spaceID, err)
140+
}
141+
all = append(all, resp.Results...)
142+
143+
cursor = parseCursor(resp.Links.Next)
144+
if cursor == "" {
145+
break
146+
}
147+
}
148+
return all, nil
149+
}
150+
151+
// GetPageLabels returns labels for a page.
152+
func (c *Client) GetPageLabels(ctx context.Context, pageID string) ([]Label, error) {
153+
var resp struct {
154+
Results []Label `json:"results"`
155+
}
156+
if err := c.get(ctx, "/api/v2/pages/"+pageID+"/labels", nil, &resp); err != nil {
157+
return nil, fmt.Errorf("get labels for page %s: %w", pageID, err)
158+
}
159+
return resp.Results, nil
160+
}
161+
162+
func (c *Client) get(ctx context.Context, path string, params url.Values, out any) error {
163+
u := c.baseURL + path
164+
if len(params) > 0 {
165+
u += "?" + params.Encode()
166+
}
167+
168+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
169+
if err != nil {
170+
return fmt.Errorf("create request: %w", err)
171+
}
172+
req.SetBasicAuth(c.username, c.token)
173+
req.Header.Set("Accept", "application/json")
174+
175+
resp, err := c.httpClient.Do(req)
176+
if err != nil {
177+
return fmt.Errorf("execute request: %w", err)
178+
}
179+
defer resp.Body.Close()
180+
181+
if resp.StatusCode != http.StatusOK {
182+
body, _ := io.ReadAll(resp.Body)
183+
return fmt.Errorf("unexpected status %d: %s", resp.StatusCode, truncate(string(body), 200))
184+
}
185+
186+
if err := json.NewDecoder(resp.Body).Decode(out); err != nil {
187+
return fmt.Errorf("decode response: %w", err)
188+
}
189+
return nil
190+
}
191+
192+
// parseCursor returns the value of the "cursor" query parameter of nextLink.
// It returns "" when nextLink is empty, cannot be parsed as a URL, or has no
// cursor parameter.
func parseCursor(nextLink string) string {
	if nextLink != "" {
		if parsed, err := url.Parse(nextLink); err == nil {
			return parsed.Query().Get("cursor")
		}
	}
	return ""
}
203+
204+
// truncate shortens s to at most n bytes and appends "..." when anything was
// cut off; strings of n bytes or fewer are returned unchanged.
//
// The cut is moved back to the nearest rune boundary so a multi-byte UTF-8
// character is never split in half, which would leave invalid text in the
// result. A negative n is treated as 0 instead of panicking on the slice.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}

	// Ranging over a string yields the byte offset at which each rune starts,
	// so the last offset not beyond n is the largest safe cut point.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}

0 commit comments

Comments
 (0)