crawlcore
diff --git a/docs/cli-tools.md b/docs/cli-tools.md
Lines changed: 2 additions & 2 deletions
diff --git a/docs/implementation.md b/docs/implementation.md
Lines changed: 23 additions & 21 deletions
diff --git a/docs/index.md b/docs/index.md
Lines changed: 32 additions & 26 deletions
diff --git a/reference-impl/scp/parser.py b/reference-impl/scp/parser.py
Lines changed: 65 additions & 24 deletions
diff --git a/reference-impl/tests/test_parser.py b/reference-impl/tests/test_parser.py
Lines changed: 0 additions & 8 deletions
@@ -77,7 +77,7 @@ scp-inspect --json collection.scp.gz > output.json
 scp-inspect --url "https://example.com/page" collection.scp.gz
 
 # Show only pages modified after date
-scp-inspect --since "2025-01-15T00:00:00Z" collection.scp.gz
+scp-inspect --since "2000-01-15T00:00:00Z" collection.scp.gz
 ```
 
 
@@ -100,7 +100,7 @@ scp-inspect --json collection.scp.gz > data.json
 
 **View recent changes (delta)**:
 ```bash
-scp-inspect --pages --since "2025-01-15T00:00:00Z" collection.scp.gz
+scp-inspect --pages --since "2000-01-15T00:00:00Z" collection.scp.gz
 ```
 
 **Debug content blocks**:
 
@@ -31,13 +31,13 @@ for page in pages[:5]:
 ```python
 from scp.generator import SCPGenerator
 
-gen = SCPGenerator("blog-snapshot-2025-01", "blog", "snapshot")
+gen = SCPGenerator("blog-snapshot-q1", "blog", "snapshot")
 
 gen.add_page(
     url="https://example.com/post1",
     title="First Post",
     description="My first blog post",
-    modified="2025-01-15T10:00:00Z",
+    modified="2000-01-15T10:00:00Z",
     language="en",
     content=[
         {"type": "heading", "level": 1, "text": "First Post"},
@@ -57,7 +57,7 @@ from scp.generator import SCPGenerator
 
 # Create generator for snapshot
 gen = SCPGenerator(
-    collection_id="blog-snapshot-2025-01-15",
+    collection_id="blog-snapshot-day15",
     section="blog",
     collection_type="snapshot"
 )
@@ -109,20 +109,22 @@ gen.save(f"blog-delta-{datetime.now().strftime('%Y-%m-%d')}.scp.gz", compress="g
 
 ## Hosting Collections
 
-### Cloudflare R2 (example)
+### Using Object Storage or CDN
+
+Upload collections to any S3-compatible object storage or CDN using tools like rclone, aws-cli, or provider-specific clients:
 
 ```bash
-# Upload to Cloudflare R2 using rclone
-rclone copy blog-snapshot.scp.gz r2:your-bucket/collections/
+# Example: Upload using rclone (works with AWS S3, Azure Blob, Google Cloud Storage, etc.)
+rclone copy blog-snapshot.scp.gz remote:your-bucket/collections/
 ```
 
-Configure rclone for R2:
+Configure rclone for S3-compatible storage:
 ```bash
-rclone config create r2 s3 \
-  provider Cloudflare \
-  access_key_id your-r2-access-key \
-  secret_access_key your-r2-secret-key \
-  endpoint https://your-account-id.r2.cloudflarestorage.com
+rclone config create remote s3 \
+  provider your-provider \
+  access_key_id your-access-key \
+  secret_access_key your-secret-key \
+  endpoint https://your-storage-endpoint.com
 ```
 
 ### Update Sitemap.xml
@@ -141,21 +143,21 @@ sitemap.add_section("blog", update_freq="daily", pages=5247)
 sitemap.add_collection(
     section="blog",
     collection_type="snapshot",
-    url="https://cdn.example.com/collections/blog-snapshot-2025-01-15.scp.gz",
-    generated="2025-01-15T00:00:00Z",
+    url="https://cdn.example.com/collections/blog-snapshot-day15.scp.gz",
+    generated="2000-01-15T00:00:00Z",
     pages=5247,
     size=52000000
 )
 
 # Add delta
 sitemap.add_delta(
     section="blog",
-    period="2025-01-15",
-    url="https://cdn.example.com/collections/blog-delta-2025-01-15.scp.gz",
-    generated="2025-01-15T23:00:00Z",
+    period="day15",
+    url="https://cdn.example.com/collections/blog-delta-day15.scp.gz",
+    generated="2000-01-15T23:00:00Z",
     pages=47,
     size=480000,
-    since="2025-01-14T00:00:00Z"
+    since="2000-01-14T00:00:00Z"
 )
 
 # Save sitemap
@@ -173,14 +175,14 @@ sitemap.save("sitemap.xml")
 # Generate snapshot
 python generate_snapshot.py
 
-# Upload to R2
-rclone copy blog-snapshot.scp.gz r2:your-bucket/collections/
+# Upload to object storage
+rclone copy blog-snapshot.scp.gz remote:your-bucket/collections/
 
 # Update sitemap
 python update_sitemap.py
 
 # Upload sitemap
-rclone copy sitemap.xml r2:your-bucket/
+rclone copy sitemap.xml remote:your-bucket/
 ```
 
 Schedule with cron:
 
@@ -2,36 +2,48 @@
 
 ## What is SCP?
 
-The Site Content Protocol (SCP) is a collection-based format for efficiently serving web content to crawlers while users continue accessing regular HTML pages.
+The Site Content Protocol (SCP) is a format for serving clean, structured web content to AI training systems and search engines. Websites provide pre-generated JSON collections optimized for machine consumption, while end users continue accessing regular HTML pages.
 
 ## Problem
 
-Web crawlers (search engines, AI bots, aggregators) consume massive bandwidth and server resources by parsing web-pages designed for human viewing.
-With the explosion of AI crawlers, this traffic has become a significant cost for websites and strain on internet infrastructure.
+AI training systems and search engines need massive web content datasets, but current HTML scraping approaches create three critical problems:
 
-Sources:
-
-- [Cloudflare Year in Review 2025](https://radar.cloudflare.com/year-in-review/2025)
-- [FOSS Infrastructure Under Attack by AI Companies](https://thelibre.news/foss-infrastructure-is-under-attack-by-ai-companies/)
-- [Web Scraping Market Report 2025](https://scrapeops.io/web-scraping-playbook/web-scraping-market-report-2025/)
+1. **Low-quality training data** - Content extracted from HTML is contaminated with navigation menus, advertisements, boilerplate text, and formatting markup, degrading model training quality.
+2. **High infrastructure costs** - Processing complete HTML/CSS/JavaScript responses for millions of pages creates substantial bandwidth and computational overhead for both publishers and crawlers.
+3. **Legal and ethical uncertainty** - Automated scraping exists in a gray area. Websites lack a clear, voluntary mechanism to contribute high-quality content to AI training while maintaining control over their intellectual property.
 
 ## Solution
 
-Websites pre-generate compressed collections and host them on CDN or Cloud Object Storage:
+SCP provides a voluntary, structured alternative to HTML scraping:
+
+**For Publishers:**
+
+- Generate clean JSON collections from your CMS/database (not HTML parsing)
+- Host compressed files on CDN or object storage
+- Declare collection availability in sitemap.xml
+- Maintain full control over what content is included
+
+**For Crawlers:**
+
+- Download entire content sections in one request
+- Receive structured data optimized for training/indexing
+- Use efficient delta updates (only changed pages)
+- Respect publisher-provided content boundaries
 
-1. Website generates `blog-snapshot-2025-01-15.scp.gz` (5,247 pages → 52 MB)
+**Example:**
+
+1. Website generates `blog-snapshot-day15.scp.gz` (5,247 pages → 52 MB)
 2. Uploads to CDN or Cloud Object Storage
-3. Declares availability of content collections in sitemap.xml
-4. Crawler downloads entire collection in one request
-5. Later: crawler downloads delta `blog-delta-2025-01-16.scp.gz` (47 pages → 480 KB)
+3. Crawler discovers the collection via sitemap.xml and downloads it in one request
+4. Later: crawler downloads delta `blog-delta-day16.scp.gz` (47 pages → 480 KB)
 
 ### Expected Impact
 
-- 50-60% bandwidth reduction for initial snapshots
-- 90-95% bandwidth reduction with delta updates
-- Faster parsing than HTML/CSS/JS
-- 90% fewer requests (one download fetches entire sections)
-- Zero impact on user experience
+- **Clean training data**: Structured content without navigation menus, ads, boilerplate, or formatting markup
+- **Voluntary contribution**: Clear mechanism for sites to contribute high-quality content to AI training with explicit consent
+- **Reduced infrastructure costs**: Lower bandwidth and processing overhead for both publishers and crawlers
+- **Efficient updates**: Delta collections deliver only changed pages, minimizing redundant transfers
+- **Zero user impact**: End users continue accessing regular HTML pages
 
 ## Documentation
 
@@ -54,16 +66,10 @@ Websites pre-generate compressed collections and host them on CDN or Cloud Objec
 
 **Next Steps**:
 
-1. Community feedback (3 months)
+1. Community feedback (1 month)
      - Post to Hacker News, Reddit, tech blogs
      - Iterate on spec based on feedback
-2. Creation of IETF Internet-Draft (2 months)
-
-**Future**:
-
-- Bot verification using [Web Bot Auth](https://developers.cloudflare.com/bots/reference/bot-verification/web-bot-auth/)
-- Pay-per-crawl model similar to [Cloudflare's Pay Per Crawl](https://blog.cloudflare.com/introducing-pay-per-crawl/)
-
+2. Update of IETF Internet-Draft (2 weeks)
 
 ## License
 
 
@@ -2,13 +2,16 @@
 
 from __future__ import annotations
 
+import logging
 from pathlib import Path
 
 import orjson
 
 from scp import checksum, compression, schema
 from scp.exceptions import ChecksumError, DecompressionError, SizeLimitError, ValidationError
 
+logger = logging.getLogger(__name__)
+
 
 class CollectionMetadata:
     """Collection metadata from first line of SCP file."""
@@ -66,14 +69,16 @@ class SCPParser:
     Parses line-by-line for memory efficiency.
     """
 
-    def __init__(self, validate: bool = True, strict: bool = False):
+    def __init__(self, strict: bool = False):
         """Initialize parser.
 
         Args:
-            validate: Whether to validate against JSON schemas (default: True)
             strict: Whether to raise on non-fatal errors like unknown content blocks
+
+        Note:
+            Validation is mandatory per SCP v0.1 specification.
+            Parsers MUST validate collection metadata and page objects.
         """
-        self.validate = validate
         self.strict = strict
         self.metadata: CollectionMetadata | None = None
         self.pages: list[Page] = []
@@ -130,9 +135,8 @@ def parse_file(self, file_path: str | Path) -> tuple[CollectionMetadata, list[Pa
         if "collection" not in metadata_dict:
             raise ValidationError("First line must contain collection metadata")
 
-        # Validate metadata
-        if self.validate:
-            schema.validate_collection_metadata(metadata_dict)
+        # Validate metadata (mandatory per SCP v0.1 spec)
+        schema.validate_collection_metadata(metadata_dict)
 
         self.metadata = CollectionMetadata(metadata_dict)
 
@@ -168,16 +172,34 @@ def parse_file(self, file_path: str | Path) -> tuple[CollectionMetadata, list[Pa
                     f"({page_size} > {schema.MAX_PAGE_SIZE})"
                 )
 
-            # Validate page
-            if self.validate:
-                try:
-                    schema.validate_page(page_dict)
-                except ValidationError as e:
-                    error_msg = f"Page validation failed at line {line_num}: {e}"
-                    if self.strict:
-                        raise ValidationError(error_msg) from e
-                    self._errors.append(error_msg)
-                    continue
+            # Validate page (mandatory per SCP v0.1 spec)
+            try:
+                schema.validate_page(page_dict)
+            except ValidationError as e:
+                error_msg = f"Page validation failed at line {line_num}: {e}"
+                if self.strict:
+                    raise ValidationError(error_msg) from e
+                self._errors.append(error_msg)
+                continue
+
+            # Check for unknown content block types (SCP v0.1 spec: MUST log warning, MUST continue)
+            if "content" in page_dict:
+                for block_idx, block in enumerate(page_dict["content"]):
+                    if isinstance(block, dict) and "type" in block:
+                        try:
+                            is_known = schema.validate_content_block(block)
+                            if not is_known:
+                                # MUST log warning for unknown types
+                                warning_msg = (
+                                    f"Unknown content block type '{block['type']}' "
+                                    f"at line {line_num}, block {block_idx}. "
+                                    f"Skipping block and continuing (per SCP v0.1 spec)."
+                                )
+                                logger.warning(warning_msg)
+                                self._errors.append(warning_msg)
+                        except ValidationError:
+                            # Known type but validation failed - already handled by page validation
+                            pass
 
             try:
                 page = Page(page_dict)
@@ -214,10 +236,9 @@ def parse_bytes(self, data: bytes) -> tuple[CollectionMetadata, list[Page]]:
         if not lines:
             raise ValidationError("Empty data")
 
-        # Parse metadata
+        # Parse metadata (mandatory per SCP v0.1 spec)
         metadata_dict = orjson.loads(lines[0])
-        if self.validate:
-            schema.validate_collection_metadata(metadata_dict)
+        schema.validate_collection_metadata(metadata_dict)
         self.metadata = CollectionMetadata(metadata_dict)
 
         # Verify checksum
@@ -231,26 +252,46 @@ def parse_bytes(self, data: bytes) -> tuple[CollectionMetadata, list[Page]]:
                 continue
 
             page_dict = orjson.loads(line)
-            if self.validate:
-                schema.validate_page(page_dict)
+            # Validate page (mandatory per SCP v0.1 spec)
+            schema.validate_page(page_dict)
+
+            # Check for unknown content block types (SCP v0.1 spec: MUST log warning, MUST continue)
+            if "content" in page_dict:
+                for block_idx, block in enumerate(page_dict["content"]):
+                    if isinstance(block, dict) and "type" in block:
+                        try:
+                            is_known = schema.validate_content_block(block)
+                            if not is_known:
+                                # MUST log warning for unknown types
+                                warning_msg = (
+                                    f"Unknown content block type '{block['type']}' "
+                                    f"at line {line_num}, block {block_idx}. "
+                                    f"Skipping block and continuing (per SCP v0.1 spec)."
+                                )
+                                logger.warning(warning_msg)
+                        except ValidationError:
+                            # Known type but validation failed - already handled by page validation
+                            pass
 
             self.pages.append(Page(page_dict))
 
         return self.metadata, self.pages
 
 
 def parse_collection(
-    file_path: str | Path, validate: bool = True, strict: bool = False
+    file_path: str | Path, strict: bool = False
 ) -> tuple[CollectionMetadata, list[Page]]:
     """Parse SCP collection file (convenience function).
 
     Args:
         file_path: Path to .scp, .scp.gz, or .scp.zst file
-        validate: Whether to validate against JSON schemas
         strict: Whether to raise on non-fatal errors
 
     Returns:
         Tuple of (metadata, pages)
+
+    Note:
+        Validation is mandatory per SCP v0.1 specification.
     """
-    parser = SCPParser(validate=validate, strict=strict)
+    parser = SCPParser(strict=strict)
     return parser.parse_file(file_path)
@@ -124,14 +124,6 @@ def test_parse_with_checksum(tmp_path: Path) -> None:
     assert metadata.checksum.startswith("sha256:")
 
 
-def test_parse_without_validation(example_snapshot: Path) -> None:
-    """Test parsing without schema validation."""
-    scp_parser = parser.SCPParser(validate=False)
-    metadata, pages = scp_parser.parse_file(example_snapshot)
-
-    assert len(pages) == 2
-
-
 def test_parse_delta_collection(tmp_path: Path) -> None:
     """Test parsing delta collection."""
     gen = generator.SCPGenerator("test-delta", "blog", "delta", since="2025-01-14T00:00:00Z")