preswald/simple_encoding_test.py at 56fd523242bddc2d8dfdc6caf4d8a3abdc182b07 · StructuredLabs/preswald · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
"""
Simple test script to verify that DuckDB's CSV encoding support works correctly.
This script creates a test CSV file with ISO-8859-1 encoding and tests loading it.
"""

import os
import tempfile
import duckdb

def create_test_csv_with_latin1():
    """Write a small Latin-1 (ISO-8859-1) encoded CSV fixture to a temp file.

    The file holds a header line plus three rows containing accented
    characters. It is created with ``delete=False``, so it survives closing
    and the caller is responsible for removing it.

    Returns:
        The filesystem path of the newly written ``.csv`` file.
    """
    rows = (
        "name,value,description\n",
        "José,123,áéíóú\n",
        "François,456,ñç\n",
        "Müller,789,ßäöü\n",
    )

    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.csv', delete=False, encoding='latin-1'
    )
    # Leaving the ``with`` block flushes and closes the file; the path
    # stays available on the handle afterwards.
    with handle:
        for row in rows:
            handle.write(row)

    return handle.name

def test_utf8_encoding_fails():
    """Read the Latin-1 fixture while telling DuckDB the file is UTF-8.

    The read is expected to fail, but the query also sets
    ``ignore_errors=true``, so it may complete without raising. Both
    outcomes are reported with a check mark; nothing is asserted.

    The temporary CSV is always removed, and the connection is closed only
    if it was actually opened.
    """
    temp_file = create_test_csv_with_latin1()
    # Double any single quote so the path cannot terminate the SQL literal.
    sql_path = temp_file.replace("'", "''")
    # Stays None when duckdb.connect() itself raises, so cleanup below does
    # not trip over an unbound name and hide the original error.
    conn = None

    try:
        # Try to load with UTF-8 encoding (should fail)
        conn = duckdb.connect(':memory:')
        conn.execute(f"""
            SELECT * FROM read_csv_auto('{sql_path}',
                header=true,
                auto_detect=true,
                ignore_errors=true,
                normalize_names=false,
                sample_size=-1,
                all_varchar=true,
                encoding='utf-8'
            )
        """).df()

        print("✓ UTF-8 encoding test completed")

    except Exception as e:
        print(f"✓ UTF-8 encoding failed as expected: {e}")
    finally:
        # Nested try/finally: the temp file is deleted even if close() fails.
        try:
            if conn is not None:
                conn.close()
        finally:
            os.unlink(temp_file)

def test_latin1_encoding_succeeds():
    """Read the Latin-1 fixture with ``encoding='latin-1'`` and print its rows.

    On success the row count, column names and every row are printed; on
    any exception a failure line is printed instead. Nothing is asserted.

    The temporary CSV is always removed, and the connection is closed only
    if it was actually opened.
    """
    temp_file = create_test_csv_with_latin1()
    # Double any single quote so the path cannot terminate the SQL literal.
    sql_path = temp_file.replace("'", "''")
    # Stays None when duckdb.connect() itself raises, so cleanup below does
    # not trip over an unbound name and hide the original error.
    conn = None

    try:
        # Load with Latin-1 encoding (should succeed)
        conn = duckdb.connect(':memory:')
        result = conn.execute(f"""
            SELECT * FROM read_csv_auto('{sql_path}',
                header=true,
                auto_detect=true,
                ignore_errors=true,
                normalize_names=false,
                sample_size=-1,
                all_varchar=true,
                encoding='latin-1'
            )
        """).df()

        print(f"✓ Latin-1 encoding succeeded! Loaded {len(result)} rows")
        print(f"  Columns: {list(result.columns)}")
        print("  Sample data:")
        for _, row in result.iterrows():
            print(f"    {row['name']}, {row['value']}, {row['description']}")

    except Exception as e:
        print(f"✗ Latin-1 encoding failed: {e}")
    finally:
        # Nested try/finally: the temp file is deleted even if close() fails.
        try:
            if conn is not None:
                conn.close()
        finally:
            os.unlink(temp_file)

def test_default_encoding():
    """Read a plain UTF-8 CSV without passing an ``encoding`` option.

    A two-row UTF-8 file is written to a temporary path and loaded through
    ``read_csv_auto``. On success the row count and rows are printed; on
    any exception a failure line is printed instead. Nothing is asserted.

    The temporary CSV is always removed, and the connection is closed only
    if it was actually opened.
    """
    # Create a regular UTF-8 CSV file
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8') as f:
        f.write("name,value\n")
        f.write("John,100\n")
        f.write("Jane,200\n")
        temp_file = f.name

    # Double any single quote so the path cannot terminate the SQL literal.
    sql_path = temp_file.replace("'", "''")
    # Stays None when duckdb.connect() itself raises, so cleanup below does
    # not trip over an unbound name and hide the original error.
    conn = None

    try:
        # Load with default encoding (should succeed)
        conn = duckdb.connect(':memory:')
        result = conn.execute(f"""
            SELECT * FROM read_csv_auto('{sql_path}',
                header=true,
                auto_detect=true,
                ignore_errors=true,
                normalize_names=false,
                sample_size=-1,
                all_varchar=true
            )
        """).df()

        print(f"✓ Default encoding succeeded! Loaded {len(result)} rows")
        print("  Sample data:")
        for _, row in result.iterrows():
            print(f"    {row['name']}, {row['value']}")

    except Exception as e:
        print(f"✗ Default encoding failed: {e}")
    finally:
        # Nested try/finally: the temp file is deleted even if close() fails.
        try:
            if conn is not None:
                conn.close()
        finally:
            os.unlink(temp_file)

if __name__ == "__main__":
    # Run each check in order, separating their output with a blank line.
    separator = "=" * 50

    print("Testing DuckDB CSV encoding support...")
    print(separator)

    for check in (
        test_default_encoding,
        test_utf8_encoding_fails,
        test_latin1_encoding_succeeds,
    ):
        check()
        print()

    print(separator)
    print("Test completed!")