Skip to content
This repository was archived by the owner on Mar 7, 2026. It is now read-only.

Commit 20dccc5

Browse files
authored
feat: edit ui + add return html option (#90)
* fix: restyle the element table
* chore: wip ui
* wip: edit styles
* feat: add html return
* fix: build
* fix: workflow
* fix: workflow
* fix: workflow
* fix: workflow
* fix: workflow
* fix: workflow
* fix: workflow
* fix: cypress test
* chore: update photo [skip ci]
1 parent 02619eb commit 20dccc5

25 files changed

Lines changed: 627 additions & 282 deletions

File tree

.github/actions/run-cypress-tests/action.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,5 +73,8 @@ runs:
7373
7474
- name: Run Cypress tests
7575
shell: bash
76-
run: npm run cy:run
76+
run: |
77+
set -e
78+
npm run cy:run
79+
7780

.github/workflows/cypress-tests.yml

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,15 +18,14 @@ jobs:
1818
uses: ./.github/actions/run-cypress-tests
1919
with:
2020
openai_key: ${{ secrets.openai_key }}
21-
continue-on-error: true
2221

2322
- name: Check container logs on failure
24-
if: steps.run-tests.outcome == 'failure'
23+
if: steps.run-tests.conclusion == 'failure'
2524
run: |
2625
echo "Cypress tests failed. Dumping container logs..."
2726
docker logs scraperr_api || true
2827
2928
- name: Fail job if Cypress failed
30-
if: steps.run-tests.outcome == 'failure'
29+
if: steps.run-tests.conclusion == 'failure'
3130
run: exit 1
32-
- uses: actions/checkout@v4
31+

.github/workflows/tests.yml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@ jobs:
2222
uses: ./.github/actions/run-cypress-tests
2323
with:
2424
openai_key: ${{ secrets.openai_key }}
25-
continue-on-error: true
2625

2726
success-message:
2827
runs-on: ubuntu-latest

api/backend/ai/agent/agent.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,9 @@ async def scrape_with_agent(agent_job: dict[str, Any]):
6363

6464
xpaths = parse_response(response)
6565

66-
captured_elements = await capture_elements(page, xpaths)
66+
captured_elements = await capture_elements(
67+
page, xpaths, agent_job["job_options"]["return_html"]
68+
)
6769

6870
final_url = page.url
6971

api/backend/ai/agent/utils.py

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -206,7 +206,7 @@ def parse_next_page(text: str) -> str | None:
206206

207207

208208
async def capture_elements(
209-
page: Page, xpaths: list[dict[str, str]]
209+
page: Page, xpaths: list[dict[str, str]], return_html: bool
210210
) -> list[CapturedElement]:
211211
captured_elements = []
212212
seen_texts = set()
@@ -217,6 +217,23 @@ async def capture_elements(
217217
count = await locator.count()
218218

219219
for i in range(count):
220+
if return_html:
221+
element_text = (
222+
await page.locator(f"xpath={xpath['xpath']}")
223+
.nth(i)
224+
.inner_html()
225+
)
226+
227+
seen_texts.add(element_text)
228+
captured_elements.append(
229+
CapturedElement(
230+
name=xpath["name"],
231+
text=element_text,
232+
xpath=xpath["xpath"],
233+
)
234+
)
235+
continue
236+
220237
element_text = ""
221238

222239
element_handle = await locator.nth(i).element_handle()

api/backend/job/models/job_options.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,3 +25,4 @@ class JobOptions(BaseModel):
2525
site_map: Optional[SiteMap] = None
2626
collect_media: bool = False
2727
custom_cookies: list[dict[str, Any]] = []
28+
return_html: bool = False

api/backend/job/scraping/scraping.py

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,9 @@ async def make_site_request(
110110
)
111111

112112

113-
async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
113+
async def collect_scraped_elements(
114+
page: tuple[str, str], xpaths: list[Element], return_html: bool
115+
):
114116
soup = BeautifulSoup(page[0], "lxml")
115117
root = etree.HTML(str(soup))
116118

@@ -120,6 +122,16 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
120122
el = sxpath(root, elem.xpath)
121123

122124
for e in el: # type: ignore
125+
if return_html:
126+
elements[elem.name] = [
127+
CapturedElement(
128+
xpath=elem.xpath,
129+
text=page[0],
130+
name=elem.name,
131+
)
132+
]
133+
continue
134+
123135
text = (
124136
" ".join(str(t) for t in e.itertext())
125137
if isinstance(e, etree._Element)
@@ -161,6 +173,8 @@ async def scrape(
161173
elements: list[dict[str, dict[str, list[CapturedElement]]]] = []
162174

163175
for page in pages:
164-
elements.append(await collect_scraped_elements(page, xpaths))
176+
elements.append(
177+
await collect_scraped_elements(page, xpaths, job_options["return_html"])
178+
)
165179

166180
return elements

cypress/utilities/job.utilities.ts

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -100,27 +100,27 @@ export const waitForJobCompletion = (url: string) => {
100100
};
101101

102102
export const enableMultiPageScraping = () => {
103-
cy.get("button").contains("Advanced Job Options").click();
103+
cy.get("button").contains("Advanced Options").click();
104104
cy.get('[data-cy="multi-page-toggle"]').click();
105105
cy.get("body").type("{esc}");
106106
};
107107

108108
export const addCustomHeaders = (headers: Record<string, string>) => {
109-
cy.get("button").contains("Advanced Job Options").click();
109+
cy.get("button").contains("Advanced Options").click();
110110
cy.get('[name="custom_headers"]').type(JSON.stringify(headers), {
111111
parseSpecialCharSequences: false,
112112
});
113113
cy.get("body").type("{esc}");
114114
};
115115

116116
export const addCustomCookies = (cookies: Record<string, string>) => {
117-
cy.get("button").contains("Advanced Job Options").click();
117+
cy.get("button").contains("Advanced Options").click();
118118
cy.get('[name="custom_cookies"]').type(JSON.stringify(cookies));
119119
cy.get("body").type("{esc}");
120120
};
121121

122122
export const openAdvancedJobOptions = () => {
123-
cy.get("button").contains("Advanced Job Options").click();
123+
cy.get("button").contains("Advanced Options").click();
124124
};
125125

126126
export const selectJobFromSelector = () => {

docs/main_page.png

18.7 KB
Loading

src/components/common/advanced-job-options/advanced-job-options.tsx

Lines changed: 19 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
1-
import { Box, Link, Typography } from "@mui/material";
2-
import { SetStateAction, Dispatch, useState } from "react";
3-
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
41
import { RawJobOptions } from "@/types";
2+
import SettingsIcon from "@mui/icons-material/Settings";
3+
import { Box, Button, Typography } from "@mui/material";
4+
import { Dispatch, SetStateAction, useState } from "react";
5+
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
56

67
export type AdvancedJobOptionsProps = {
78
jobOptions: RawJobOptions;
@@ -17,26 +18,27 @@ export const AdvancedJobOptions = ({
1718
const [open, setOpen] = useState(false);
1819

1920
return (
20-
<Box sx={{ mb: 2 }}>
21-
<Link
22-
component="button"
23-
variant="body2"
21+
<Box sx={{ display: "flex", alignItems: "center", gap: 1 }}>
22+
<Button
23+
variant="outlined"
2424
onClick={() => setOpen(true)}
25+
startIcon={<SettingsIcon />}
2526
sx={{
26-
textDecoration: "none",
27-
color: "primary.main",
27+
textTransform: "none",
28+
borderRadius: 2,
29+
px: 2,
30+
py: 1,
31+
borderColor: "divider",
32+
color: "text.secondary",
2833
"&:hover": {
29-
color: "primary.dark",
30-
textDecoration: "underline",
34+
borderColor: "primary.main",
35+
color: "primary.main",
36+
bgcolor: "action.hover",
3137
},
32-
paddingLeft: 1,
33-
display: "inline-flex",
34-
alignItems: "center",
35-
gap: 0.5,
3638
}}
3739
>
38-
<Typography variant="body2">Advanced Job Options</Typography>
39-
</Link>
40+
<Typography variant="body2">Advanced Options</Typography>
41+
</Button>
4042

4143
<AdvancedJobOptionsDialog
4244
open={open}

0 commit comments

Comments
 (0)