Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions plugins/extractors/confluence/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Confluence

Extract page metadata and relationships from Confluence spaces using the Confluence REST API v2.

## Usage

```yaml
source:
name: confluence
scope: my-confluence
config:
base_url: https://mycompany.atlassian.net/wiki
username: user@company.com
token: your-api-token
spaces:
- ENG
- DATA
exclude:
- ARCHIVE
```

## Configuration

| Key | Type | Required | Description |
| :-- | :--- | :------- | :---------- |
| `base_url` | `string` | Yes | Confluence base URL (e.g. `https://mycompany.atlassian.net/wiki`). |
| `username` | `string` | Yes | Atlassian account email for API authentication. |
| `token` | `string` | Yes | Atlassian API token. |
| `spaces` | `[]string` | No | Space keys to extract. Defaults to all spaces. |
| `exclude` | `[]string` | No | Space keys to exclude from extraction. |

## Entities

The extractor emits two entity types, `space` and `document`, and emits their relationships as edges.

### Entity: `space`

| Field | Sample Value |
| :---- | :----------- |
| `urn` | `urn:confluence:my-confluence:space:ENG` |
| `name` | `Engineering` |
| `description` | `Engineering team documentation` |
| `properties.space_key` | `ENG` |
| `properties.space_type` | `global` |
| `properties.status` | `current` |
| `properties.web_url` | `https://mycompany.atlassian.net/wiki/spaces/ENG` |

### Entity: `document`

| Field | Sample Value |
| :---- | :----------- |
| `urn` | `urn:confluence:my-confluence:document:12345` |
| `name` | `Data Pipeline Architecture` |
| `properties.page_id` | `12345` |
| `properties.space_key` | `ENG` |
| `properties.status` | `current` |
| `properties.version` | `5` |
| `properties.labels` | `["architecture", "data"]` |
| `properties.created_at` | `2024-01-15T10:30:00Z` |
| `properties.updated_at` | `2024-03-20T14:15:00Z` |
| `properties.web_url` | `https://mycompany.atlassian.net/wiki/spaces/ENG/pages/12345` |

### Edges

| Type | Source | Target | Description |
| :--- | :----- | :----- | :---------- |
| `belongs_to` | `document` | `space` | Page belongs to a space |
| `child_of` | `document` | `document` | Page is a child of another page |
| `owned_by` | `document` | `user` | Page is owned by its author |
| `documented_by` | `document` | any | Page references a data asset via URN in its content |

### URN Reference Detection

The extractor scans page content for URN patterns (`urn:service:scope:type:id`) and emits `documented_by` edges linking the page to referenced data assets. This enables connecting business documentation to technical metadata.

## Contributing

Refer to the [contribution guidelines](../../../docs/docs/contribute/guide.md#adding-a-new-extractor) for information on contributing to this module.
228 changes: 228 additions & 0 deletions plugins/extractors/confluence/client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
package confluence

import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)

// Page represents a Confluence page from the v2 API.
//
// Every field is populated from the JSON key named in its struct tag; the
// nested anonymous structs mirror the nesting of the JSON payload.
type Page struct {
ID string `json:"id"`
Title string `json:"title"`
Status string `json:"status"`
// SpaceID is decoded from "spaceId".
// NOTE(review): presumably matches Space.ID rather than Space.Key — confirm.
SpaceID string `json:"spaceId"`
// ParentID is decoded from "parentId".
// NOTE(review): presumably empty for pages without a parent — confirm.
ParentID string `json:"parentId"`
AuthorID string `json:"authorId"`
CreatedAt time.Time `json:"createdAt"`
// Version holds the "version" object of the page: its number plus the
// author and timestamp recorded on that version object.
Version struct {
Number int `json:"number"`
AuthorID string `json:"authorId"`
CreatedAt time.Time `json:"createdAt"`
} `json:"version"`
// Body holds the page content found under "body.storage.value".
Body struct {
Storage struct {
Value string `json:"value"`
} `json:"storage"`
} `json:"body"`
// Labels holds the entries of the "labels.results" array, when present.
Labels struct {
Results []Label `json:"results"`
} `json:"labels"`
// Links holds the "_links" object; only the "webui" entry is decoded.
Links struct {
WebUI string `json:"webui"`
} `json:"_links"`
}

// Space represents a Confluence space.
//
// Every field is populated from the JSON key named in its struct tag; the
// nested anonymous structs mirror the nesting of the JSON payload.
type Space struct {
ID string `json:"id"`
Key string `json:"key"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
// Description holds the text found under "description.plain.value".
Description struct {
Plain struct {
Value string `json:"value"`
} `json:"plain"`
} `json:"description"`
// Links holds the "_links" object; only the "webui" entry is decoded.
Links struct {
WebUI string `json:"webui"`
} `json:"_links"`
}

// Label represents a Confluence label, decoded from its "id" and "name"
// JSON keys.
type Label struct {
ID string `json:"id"`
Name string `json:"name"`
}

// pageResponse is the envelope of one page-listing response: the pages in
// "results" plus the "_links.next" link used to continue pagination.
type pageResponse struct {
Results []Page `json:"results"`
// Links.Next is passed to parseCursor by the paging loops; when no cursor
// can be extracted from it, paging stops.
Links struct {
Next string `json:"next"`
} `json:"_links"`
}

// spaceResponse is the envelope of one space-listing response: the spaces in
// "results" plus the "_links.next" link used to continue pagination.
type spaceResponse struct {
Results []Space `json:"results"`
// Links.Next is passed to parseCursor by the paging loops; when no cursor
// can be extracted from it, paging stops.
Links struct {
Next string `json:"next"`
} `json:"_links"`
}

// Client wraps the Confluence REST API v2.
//
// Construct it with NewClient, which fills in every field.
type Client struct {
// baseURL is prefixed to every request path; NewClient strips trailing
// slashes from it.
baseURL string
// httpClient executes all requests issued by this client.
httpClient *http.Client
// username and token are sent as HTTP basic-auth credentials on each request.
username string
token string
}

// NewClient builds a Client for the Confluence instance rooted at baseURL.
//
// Trailing slashes are stripped from baseURL so request paths can be appended
// directly. username and token are kept for basic authentication, and the
// underlying HTTP client is given a 30-second timeout.
func NewClient(baseURL, username, token string) *Client {
	client := new(Client)
	client.baseURL = strings.TrimRight(baseURL, "/")
	client.username = username
	client.token = token
	client.httpClient = &http.Client{Timeout: 30 * time.Second}
	return client
}

// GetSpaces fetches every space visible to the client, following cursor
// pagination until the response no longer yields a cursor.
//
// When keys is non-empty, the keys are joined with commas and sent as the
// "keys" query parameter to restrict the listing. Spaces are requested 25 at
// a time. Any request failure aborts the walk and is returned wrapped with
// the prefix "get spaces"; no partial result is returned in that case.
func (c *Client) GetSpaces(ctx context.Context, keys []string) ([]Space, error) {
	var (
		spaces   []Space
		keyParam = strings.Join(keys, ",")
	)

	for next := ""; ; {
		query := url.Values{"limit": {"25"}}
		if len(keys) != 0 {
			query.Set("keys", keyParam)
		}
		if next != "" {
			query.Set("cursor", next)
		}

		var batch spaceResponse
		err := c.get(ctx, "/api/v2/spaces", query, &batch)
		if err != nil {
			return nil, fmt.Errorf("get spaces: %w", err)
		}
		spaces = append(spaces, batch.Results...)

		// No cursor in the next link means this was the last batch.
		if next = parseCursor(batch.Links.Next); next == "" {
			return spaces, nil
		}
	}
}

// GetPages fetches every page of the space identified by spaceID, following
// cursor pagination until the response no longer yields a cursor.
//
// Pages are requested 25 at a time with "body-format=storage" so each result
// carries its body in Body.Storage.Value. Any request failure aborts the walk
// and is returned wrapped with the space ID for context; no partial result is
// returned in that case.
func (c *Client) GetPages(ctx context.Context, spaceID string) ([]Page, error) {
	var pages []Page

	// The loop always runs once; afterwards it continues only while the
	// previous response produced a cursor.
	for cursor, more := "", true; more; more = cursor != "" {
		query := url.Values{
			"space-id":    {spaceID},
			"limit":       {"25"},
			"body-format": {"storage"},
		}
		if cursor != "" {
			query.Set("cursor", cursor)
		}

		var batch pageResponse
		err := c.get(ctx, "/api/v2/pages", query, &batch)
		if err != nil {
			return nil, fmt.Errorf("get pages for space %s: %w", spaceID, err)
		}

		pages = append(pages, batch.Results...)
		cursor = parseCursor(batch.Links.Next)
	}

	return pages, nil
}

// GetPageLabels returns all labels attached to the page identified by pageID,
// following cursor pagination until the response no longer yields a cursor.
//
// Labels are requested 25 at a time. Any request failure aborts the walk and
// is returned wrapped with the page ID for context; no partial result is
// returned in that case.
func (c *Client) GetPageLabels(ctx context.Context, pageID string) ([]Label, error) {
	// pageID becomes a single path segment. Escaping it guarantees that an ID
	// containing "/", "?" or "#" cannot change which endpoint is requested.
	endpoint := "/api/v2/pages/" + url.PathEscape(pageID) + "/labels"

	var all []Label
	cursor := ""
	for {
		params := url.Values{}
		params.Set("limit", "25")
		if cursor != "" {
			params.Set("cursor", cursor)
		}

		var resp struct {
			Results []Label `json:"results"`
			Links   struct {
				Next string `json:"next"`
			} `json:"_links"`
		}
		if err := c.get(ctx, endpoint, params, &resp); err != nil {
			return nil, fmt.Errorf("get labels for page %s: %w", pageID, err)
		}
		all = append(all, resp.Results...)

		// No cursor in the next link means this was the last batch.
		cursor = parseCursor(resp.Links.Next)
		if cursor == "" {
			break
		}
	}
	return all, nil
}

// get issues an authenticated GET request for path (relative to the client's
// base URL) with the given query parameters and JSON-decodes the response
// body into out.
//
// The request is bound to ctx, uses HTTP basic auth with the client's
// username and token, and asks for "application/json". Any status other than
// 200 OK is reported as an error that includes the status code and the first
// 200 bytes of the response body.
func (c *Client) get(ctx context.Context, path string, params url.Values, out any) error {
	const (
		// errSnippetLen is how much of an error body ends up in the message.
		errSnippetLen = 200
		// errReadLimit caps how much of an error body is buffered. It is
		// larger than errSnippetLen so truncate can still tell that the body
		// was longer than the snippet and mark it with "...".
		errReadLimit = 1024
	)

	u := c.baseURL + path
	if len(params) > 0 {
		u += "?" + params.Encode()
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return fmt.Errorf("create request: %w", err)
	}
	req.SetBasicAuth(c.username, c.token)
	req.Header.Set("Accept", "application/json")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("execute request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only adds context to the error, so a failed
		// read simply yields a shorter (possibly empty) snippet. The read is
		// bounded so an arbitrarily large error page is never held in memory.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, errReadLimit))
		return fmt.Errorf("unexpected status %d: %s", resp.StatusCode, truncate(string(body), errSnippetLen))
	}

	if err := json.NewDecoder(resp.Body).Decode(out); err != nil {
		return fmt.Errorf("decode response: %w", err)
	}
	return nil
}

// parseCursor returns the value of the "cursor" query parameter carried by a
// pagination next-link. It yields the empty string when the link is empty,
// cannot be parsed as a URL, or has no cursor parameter.
func parseCursor(nextLink string) string {
	if len(nextLink) == 0 {
		return ""
	}
	if parsed, err := url.Parse(nextLink); err == nil {
		return parsed.Query().Get("cursor")
	}
	return ""
}

// truncate shortens s to at most n bytes and appends "..." when anything was
// cut off. Strings of n bytes or fewer are returned unchanged.
//
// The cut is moved back to the nearest rune boundary at or before n, so a
// multi-byte UTF-8 character is never split in half. A negative n is treated
// as 0 rather than panicking on the slice expression.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}

	// Ranging over a string yields the byte offset at which each rune starts;
	// keep the last such offset that does not exceed n.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}
Loading