raystack · ravisuhag · Apr 18, 2026 · Apr 18, 2026 · Apr 18, 2026
diff --git a/plugins/extractors/confluence/README.md b/plugins/extractors/confluence/README.md
@@ -0,0 +1,78 @@
+# Confluence
+
+Extract space and page metadata, together with their relationships, from Confluence using the Confluence REST API v2.
+
+## Usage
+
+```yaml
+source:
+  name: confluence
+  scope: my-confluence
+  config:
+    base_url: https://mycompany.atlassian.net/wiki
+    username: user@company.com
+    token: your-api-token
+    spaces:
+      - ENG
+      - DATA
+    exclude:
+      - ARCHIVE
+```
+
+## Configuration
+
+| Key | Type | Required | Description |
+| :-- | :--- | :------- | :---------- |
+| `base_url` | `string` | Yes | Confluence base URL (e.g. `https://mycompany.atlassian.net/wiki`). |
+| `username` | `string` | Yes | Atlassian account email for API authentication. |
+| `token` | `string` | Yes | Atlassian API token. |
+| `spaces` | `[]string` | No | Space keys to extract. Defaults to all spaces. |
+| `exclude` | `[]string` | No | Space keys to exclude from extraction. |
+
+## Entities
+
+The extractor emits two entity types, `space` and `document`, and records their relationships as edges.
+
+### Entity: `space`
+
+| Field | Sample Value |
+| :---- | :----------- |
+| `urn` | `urn:confluence:my-confluence:space:ENG` |
+| `name` | `Engineering` |
+| `description` | `Engineering team documentation` |
+| `properties.space_key` | `ENG` |
+| `properties.space_type` | `global` |
+| `properties.status` | `current` |
+| `properties.web_url` | `https://mycompany.atlassian.net/wiki/spaces/ENG` |
+
+### Entity: `document`
+
+| Field | Sample Value |
+| :---- | :----------- |
+| `urn` | `urn:confluence:my-confluence:document:12345` |
+| `name` | `Data Pipeline Architecture` |
+| `properties.page_id` | `12345` |
+| `properties.space_key` | `ENG` |
+| `properties.status` | `current` |
+| `properties.version` | `5` |
+| `properties.labels` | `["architecture", "data"]` |
+| `properties.created_at` | `2024-01-15T10:30:00Z` |
+| `properties.updated_at` | `2024-03-20T14:15:00Z` |
+| `properties.web_url` | `https://mycompany.atlassian.net/wiki/spaces/ENG/pages/12345` |
+
+### Edges
+
+| Type | Source | Target | Description |
+| :--- | :----- | :----- | :---------- |
+| `belongs_to` | `document` | `space` | Page belongs to a space |
+| `child_of` | `document` | `document` | Page is a child of another page |
+| `owned_by` | `document` | `user` | Page is owned by its author |
+| `documented_by` | `document` | any | Page references a data asset via URN in its content |
+
+### URN Reference Detection
+
+The extractor scans page content for URN patterns (`urn:service:scope:type:id`) and emits `documented_by` edges linking the page to referenced data assets. This enables connecting business documentation to technical metadata.
+
+## Contributing
+
+Refer to the [contribution guidelines](../../../docs/docs/contribute/guide.md#adding-a-new-extractor) for information on contributing to this module.
diff --git a/plugins/extractors/confluence/client.go b/plugins/extractors/confluence/client.go
@@ -0,0 +1,228 @@
+package confluence
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+)
+
+// Page represents a Confluence page from the v2 API.
+//
+// It is a plain JSON decoding target: each field is filled from the key named
+// in its struct tag, and a key that is missing from the response leaves the
+// field at its zero value. Body.Storage.Value maps to "body.storage.value",
+// Labels.Results to "labels.results", and Links.WebUI to "_links.webui". The
+// nested Version struct carries its own AuthorID and CreatedAt, which are
+// separate from the page-level fields of the same name.
+//
+// NOTE(review): whether the API actually returns the body and labels
+// presumably depends on the query parameters of the request — confirm
+// against the caller.
+type Page struct {
+	ID        string    `json:"id"`
+	Title     string    `json:"title"`
+	Status    string    `json:"status"`
+	SpaceID   string    `json:"spaceId"`
+	ParentID  string    `json:"parentId"`
+	AuthorID  string    `json:"authorId"`
+	CreatedAt time.Time `json:"createdAt"`
+	Version   struct {
+		Number    int       `json:"number"`
+		AuthorID  string    `json:"authorId"`
+		CreatedAt time.Time `json:"createdAt"`
+	} `json:"version"`
+	Body struct {
+		Storage struct {
+			Value string `json:"value"`
+		} `json:"storage"`
+	} `json:"body"`
+	Labels struct {
+		Results []Label `json:"results"`
+	} `json:"labels"`
+	Links struct {
+		WebUI string `json:"webui"`
+	} `json:"_links"`
+}
+
+// Space represents a Confluence space.
+//
+// It is a plain JSON decoding target: each field is filled from the key named
+// in its struct tag, and a missing key leaves the field at its zero value.
+// Description.Plain.Value maps to "description.plain.value" and Links.WebUI
+// to "_links.webui".
+//
+// NOTE(review): Key is presumably the short space key (such as "ENG") and ID
+// the identifier passed to GetPages — confirm against the caller.
+type Space struct {
+	ID          string `json:"id"`
+	Key         string `json:"key"`
+	Name        string `json:"name"`
+	Type        string `json:"type"`
+	Status      string `json:"status"`
+	Description struct {
+		Plain struct {
+			Value string `json:"value"`
+		} `json:"plain"`
+	} `json:"description"`
+	Links struct {
+		WebUI string `json:"webui"`
+	} `json:"_links"`
+}
+
+// Label represents a Confluence label.
+//
+// It is decoded from JSON objects carrying "id" and "name" keys, both as
+// strings; it appears in Page.Labels.Results and is the element type
+// returned by GetPageLabels.
+type Label struct {
+	ID   string `json:"id"`
+	Name string `json:"name"`
+}
+
+// pageResponse is the JSON envelope GetPages decodes for one batch of pages:
+// the pages themselves under "results" and the link to the following batch
+// under "_links.next". GetPages stops paginating once parseCursor finds no
+// cursor in Links.Next.
+type pageResponse struct {
+	Results []Page `json:"results"`
+	Links   struct {
+		Next string `json:"next"`
+	} `json:"_links"`
+}
+
+// spaceResponse is the JSON envelope GetSpaces decodes for one batch of
+// spaces: the spaces themselves under "results" and the link to the following
+// batch under "_links.next". GetSpaces stops paginating once parseCursor
+// finds no cursor in Links.Next.
+type spaceResponse struct {
+	Results []Space `json:"results"`
+	Links   struct {
+		Next string `json:"next"`
+	} `json:"_links"`
+}
+
+// Client wraps the Confluence REST API v2.
+//
+// baseURL is prepended to every request path, httpClient executes the
+// requests, and username and token are sent as HTTP basic auth credentials
+// on each call. Construct it with NewClient, which strips trailing slashes
+// from the base URL and sets a request timeout.
+type Client struct {
+	baseURL    string
+	httpClient *http.Client
+	username   string
+	token      string
+}
+
+// NewClient builds a Client for the Confluence instance at baseURL.
+//
+// Trailing slashes are stripped from baseURL so that request paths, which
+// start with "/", can be appended directly. username and token are kept for
+// HTTP basic auth, and the underlying HTTP client uses a 30-second timeout.
+func NewClient(baseURL, username, token string) *Client {
+	client := new(Client)
+	client.baseURL = strings.TrimRight(baseURL, "/")
+	client.username = username
+	client.token = token
+	client.httpClient = &http.Client{Timeout: 30 * time.Second}
+	return client
+}
+
+// GetSpaces returns all spaces, optionally filtered by keys.
+//
+// A non-empty keys slice is sent as a comma-separated "keys" query parameter;
+// an empty one applies no key filter. Results are requested 25 at a time and
+// the cursor found in each response's next link is followed until no cursor
+// remains. The plain-text description is requested explicitly so that
+// Space.Description.Plain.Value can be populated. Any request failure aborts
+// the walk and returns a nil slice together with the wrapped error.
+func (c *Client) GetSpaces(ctx context.Context, keys []string) ([]Space, error) {
+	var all []Space
+	cursor := ""
+	for {
+		params := url.Values{}
+		params.Set("limit", "25")
+		// Without an explicit format the description representation is not
+		// included in the response, leaving Description.Plain.Value empty.
+		params.Set("description-format", "plain")
+		if len(keys) > 0 {
+			params.Set("keys", strings.Join(keys, ","))
+		}
+		if cursor != "" {
+			params.Set("cursor", cursor)
+		}
+
+		var resp spaceResponse
+		if err := c.get(ctx, "/api/v2/spaces", params, &resp); err != nil {
+			return nil, fmt.Errorf("get spaces: %w", err)
+		}
+		all = append(all, resp.Results...)
+
+		cursor = parseCursor(resp.Links.Next)
+		if cursor == "" {
+			break
+		}
+	}
+	return all, nil
+}
+
+// GetPages returns all pages in a space.
+//
+// Pages are requested 25 at a time for the given spaceID with the body in
+// "storage" format. The cursor found in each response's next link is fed
+// into the following request, and the walk ends when a response yields no
+// cursor. A failed request aborts the walk and returns a nil slice together
+// with the wrapped error.
+func (c *Client) GetPages(ctx context.Context, spaceID string) ([]Page, error) {
+	var (
+		pages []Page
+		next  string
+	)
+	for {
+		query := url.Values{
+			"space-id":    {spaceID},
+			"limit":       {"25"},
+			"body-format": {"storage"},
+		}
+		if next != "" {
+			query.Set("cursor", next)
+		}
+
+		var batch pageResponse
+		err := c.get(ctx, "/api/v2/pages", query, &batch)
+		if err != nil {
+			return nil, fmt.Errorf("get pages for space %s: %w", spaceID, err)
+		}
+		pages = append(pages, batch.Results...)
+
+		next = parseCursor(batch.Links.Next)
+		if next == "" {
+			return pages, nil
+		}
+	}
+}
+
+// GetPageLabels returns all labels for a page, handling pagination.
+//
+// Labels are requested 25 at a time from the page's labels endpoint, and the
+// cursor found in each response's next link is followed until no cursor
+// remains. pageID is path-escaped before being placed in the URL so that an
+// ID containing "/", "?" or "#" cannot alter the request target. A failed
+// request aborts the walk and returns a nil slice together with the wrapped
+// error.
+func (c *Client) GetPageLabels(ctx context.Context, pageID string) ([]Label, error) {
+	path := "/api/v2/pages/" + url.PathEscape(pageID) + "/labels"
+
+	var all []Label
+	cursor := ""
+	for {
+		params := url.Values{}
+		params.Set("limit", "25")
+		if cursor != "" {
+			params.Set("cursor", cursor)
+		}
+
+		var resp struct {
+			Results []Label `json:"results"`
+			Links   struct {
+				Next string `json:"next"`
+			} `json:"_links"`
+		}
+		if err := c.get(ctx, path, params, &resp); err != nil {
+			return nil, fmt.Errorf("get labels for page %s: %w", pageID, err)
+		}
+		all = append(all, resp.Results...)
+
+		cursor = parseCursor(resp.Links.Next)
+		if cursor == "" {
+			break
+		}
+	}
+	return all, nil
+}
+
+func (c *Client) get(ctx context.Context, path string, params url.Values, out any) error {
+	u := c.baseURL + path
+	if len(params) > 0 {
+		u += "?" + params.Encode()
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
+	if err != nil {
+		return fmt.Errorf("create request: %w", err)
+	}
+	req.SetBasicAuth(c.username, c.token)
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("execute request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("unexpected status %d: %s", resp.StatusCode, truncate(string(body), 200))
+	}
+
+	if err := json.NewDecoder(resp.Body).Decode(out); err != nil {
+		return fmt.Errorf("decode response: %w", err)
+	}
+	return nil
+}
+
+// parseCursor pulls the "cursor" query parameter out of a next-link URL.
+//
+// It returns "" when the link is empty, cannot be parsed as a URL, or has no
+// cursor parameter. An empty link needs no special case: it parses to a URL
+// without a query string.
+func parseCursor(nextLink string) string {
+	if parsed, err := url.Parse(nextLink); err == nil {
+		return parsed.Query().Get("cursor")
+	}
+	return ""
+}
+
+// truncate shortens s to at most n bytes and appends "..." when anything was
+// cut off; strings of n bytes or fewer are returned unchanged.
+//
+// The cut never lands inside a multi-byte UTF-8 sequence: it is moved back
+// to the start of the rune that straddles byte n, so the result may hold
+// slightly fewer than n bytes of s. A negative n is treated as 0.
+func truncate(s string, n int) string {
+	if n < 0 {
+		n = 0
+	}
+	if len(s) <= n {
+		return s
+	}
+	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step back
+	// until the cut sits on the first byte of a rune.
+	for n > 0 && s[n]&0xC0 == 0x80 {
+		n--
+	}
+	return s[:n] + "..."
+}