1use std::collections::HashMap;
16use std::io;
17use std::str::FromStr;
18use std::sync::Arc;
19use std::task::Poll;
20
21use arrow::csv::reader::Format;
22use arrow::csv::{self, WriterBuilder};
23use arrow::error::ArrowError;
24use arrow::record_batch::RecordBatch;
25use arrow_schema::{Schema, SchemaRef};
26use async_trait::async_trait;
27use bytes::{Buf, Bytes};
28use common_runtime;
29use common_telemetry::warn;
30use datafusion::physical_plan::SendableRecordBatchStream;
31use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
32use futures::StreamExt;
33use futures::stream::BoxStream;
34use object_store::ObjectStore;
35use snafu::ResultExt;
36use tokio_util::compat::FuturesAsyncReadCompatExt;
37use tokio_util::io::SyncIoBridge;
38
39use crate::buffered_writer::DfRecordBatchEncoder;
40use crate::compression::CompressionType;
41use crate::error::{self, Result};
42use crate::file_format::{self, FileFormat, stream_to_file};
43use crate::share_buffer::SharedBuffer;
44use crate::util::normalize_infer_schema;
45
46const SKIP_BAD_RECORDS_BATCH_SIZE: usize = 1;
47
48#[derive(Debug, Clone, PartialEq, Eq)]
49pub struct CsvFormat {
50 pub has_header: bool,
51 pub skip_bad_records: bool,
52 pub delimiter: u8,
53 pub schema_infer_max_record: Option<usize>,
54 pub compression_type: CompressionType,
55 pub timestamp_format: Option<String>,
56 pub time_format: Option<String>,
57 pub date_format: Option<String>,
58}
59
60impl TryFrom<&HashMap<String, String>> for CsvFormat {
61 type Error = error::Error;
62
63 fn try_from(value: &HashMap<String, String>) -> Result<Self> {
64 let mut format = CsvFormat::default();
65 if let Some(delimiter) = value.get(file_format::FORMAT_DELIMITER) {
66 format.delimiter = u8::from_str(delimiter).map_err(|_| {
68 error::ParseFormatSnafu {
69 key: file_format::FORMAT_DELIMITER,
70 value: delimiter,
71 }
72 .build()
73 })?;
74 };
75 if let Some(compression_type) = value.get(file_format::FORMAT_COMPRESSION_TYPE) {
76 format.compression_type = CompressionType::from_str(compression_type)?;
77 };
78 if let Some(schema_infer_max_record) =
79 value.get(file_format::FORMAT_SCHEMA_INFER_MAX_RECORD)
80 {
81 format.schema_infer_max_record =
82 Some(schema_infer_max_record.parse::<usize>().map_err(|_| {
83 error::ParseFormatSnafu {
84 key: file_format::FORMAT_SCHEMA_INFER_MAX_RECORD,
85 value: schema_infer_max_record,
86 }
87 .build()
88 })?);
89 };
90 let headers = value
91 .get(file_format::FORMAT_HEADERS)
92 .map(|headers| parse_bool(file_format::FORMAT_HEADERS, headers))
93 .transpose()?;
94 let has_header = value
95 .get(file_format::FORMAT_HAS_HEADER)
96 .map(|has_header| parse_bool(file_format::FORMAT_HAS_HEADER, has_header))
97 .transpose()?;
98 match (headers, has_header) {
99 (Some(headers), Some(has_header)) if headers != has_header => {
100 return error::ParseFormatSnafu {
101 key: file_format::FORMAT_HEADERS,
102 value: format!("headers={headers}, has_header={has_header}"),
103 }
104 .fail();
105 }
106 (Some(headers), _) => format.has_header = headers,
107 (_, Some(has_header)) => format.has_header = has_header,
108 _ => {}
109 }
110 if let Some(skip_bad_records) = value.get(file_format::FORMAT_SKIP_BAD_RECORDS) {
111 format.skip_bad_records =
112 parse_bool(file_format::FORMAT_SKIP_BAD_RECORDS, skip_bad_records)?;
113 };
114 if let Some(timestamp_format) = value.get(file_format::TIMESTAMP_FORMAT) {
115 format.timestamp_format = Some(timestamp_format.clone());
116 }
117 if let Some(time_format) = value.get(file_format::TIME_FORMAT) {
118 format.time_format = Some(time_format.clone());
119 }
120 if let Some(date_format) = value.get(file_format::DATE_FORMAT) {
121 format.date_format = Some(date_format.clone());
122 }
123 Ok(format)
124 }
125}
126
127fn parse_bool(key: &'static str, value: &str) -> Result<bool> {
128 value
129 .parse()
130 .map_err(|_| error::ParseFormatSnafu { key, value }.build())
131}
132
133impl Default for CsvFormat {
134 fn default() -> Self {
135 Self {
136 has_header: true,
137 skip_bad_records: false,
138 delimiter: b',',
139 schema_infer_max_record: Some(file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD),
140 compression_type: CompressionType::Uncompressed,
141 timestamp_format: None,
142 time_format: None,
143 date_format: None,
144 }
145 }
146}
147
148#[async_trait]
149impl FileFormat for CsvFormat {
150 async fn infer_schema(&self, store: &ObjectStore, path: &str) -> Result<Schema> {
151 let meta = store
152 .stat(path)
153 .await
154 .context(error::ReadObjectSnafu { path })?;
155
156 let reader = store
157 .reader(path)
158 .await
159 .context(error::ReadObjectSnafu { path })?
160 .into_futures_async_read(0..meta.content_length())
161 .await
162 .context(error::ReadObjectSnafu { path })?
163 .compat();
164
165 let decoded = self.compression_type.convert_async_read(reader);
166
167 let delimiter = self.delimiter;
168 let schema_infer_max_record = self.schema_infer_max_record;
169 let has_header = self.has_header;
170
171 common_runtime::spawn_blocking_global(move || {
172 let reader = SyncIoBridge::new(decoded);
173
174 let format = Format::default()
175 .with_delimiter(delimiter)
176 .with_header(has_header);
177 let (schema, _records_read) = format
178 .infer_schema(reader, schema_infer_max_record)
179 .context(error::InferSchemaSnafu)?;
180
181 Ok(normalize_infer_schema(schema))
182 })
183 .await
184 .context(error::JoinHandleSnafu)?
185 }
186}
187
188pub async fn stream_to_csv(
189 stream: SendableRecordBatchStream,
190 store: ObjectStore,
191 path: &str,
192 threshold: usize,
193 concurrency: usize,
194 format: &CsvFormat,
195) -> Result<usize> {
196 stream_to_file(
197 stream,
198 store,
199 path,
200 threshold,
201 concurrency,
202 format.compression_type,
203 |buffer| {
204 let mut builder = WriterBuilder::new();
205 if let Some(timestamp_format) = &format.timestamp_format {
206 builder = builder.with_timestamp_format(timestamp_format.to_owned())
207 }
208 if let Some(date_format) = &format.date_format {
209 builder = builder.with_date_format(date_format.to_owned())
210 }
211 if let Some(time_format) = &format.time_format {
212 builder = builder.with_time_format(time_format.to_owned())
213 }
214 builder.build(buffer)
215 },
216 )
217 .await
218}
219
220impl DfRecordBatchEncoder for csv::Writer<SharedBuffer> {
221 fn write(&mut self, batch: &RecordBatch) -> Result<()> {
222 self.write(batch).context(error::WriteRecordBatchSnafu)
223 }
224}
225
226pub async fn tolerant_csv_stream(
234 store: &ObjectStore,
235 path: &str,
236 schema: SchemaRef,
237 projection: Vec<usize>,
238 format: &CsvFormat,
239) -> Result<SendableRecordBatchStream> {
240 let meta = store
241 .stat(path)
242 .await
243 .context(error::ReadObjectSnafu { path })?;
244
245 let reader = store
246 .reader(path)
247 .await
248 .context(error::ReadObjectSnafu { path })?
249 .into_bytes_stream(0..meta.content_length())
250 .await
251 .context(error::ReadObjectSnafu { path })?;
252
253 let reader = format.compression_type.convert_stream(reader).boxed();
254 tolerant_csv_stream_from_reader(
255 reader,
256 path,
257 schema,
258 projection,
259 format.has_header,
260 format.delimiter,
261 )
262}
263
264fn tolerant_csv_stream_from_reader(
265 reader: BoxStream<'static, io::Result<Bytes>>,
266 path: &str,
267 schema: SchemaRef,
268 projection: Vec<usize>,
269 has_header: bool,
270 delimiter: u8,
271) -> Result<SendableRecordBatchStream> {
272 let projected_schema = Arc::new(
273 schema
274 .project(&projection)
275 .context(error::InferSchemaSnafu)?,
276 );
277 let mut decoder = csv::ReaderBuilder::new(schema)
278 .with_header(has_header)
279 .with_delimiter(delimiter)
280 .with_batch_size(SKIP_BAD_RECORDS_BATCH_SIZE)
281 .with_projection(projection)
282 .build_decoder();
283
284 let path = path.to_string();
285 let mut upstream = reader.fuse();
286 let mut buffered = Bytes::new();
287 let mut input_finished = false;
288 let stream = futures::stream::poll_fn(move |cx| {
289 loop {
290 while !input_finished {
291 if buffered.is_empty() {
292 match futures::ready!(upstream.poll_next_unpin(cx)) {
293 Some(Ok(bytes)) if bytes.is_empty() => continue,
294 Some(Ok(bytes)) => buffered = bytes,
295 Some(Err(error)) => return Poll::Ready(Some(Err(error.into()))),
296 None => input_finished = true,
297 }
298 }
299
300 let decoded = decoder.decode(buffered.as_ref())?;
301 if decoded > 0 {
302 buffered.advance(decoded);
303 continue;
304 }
305
306 if decoder.capacity() == 0 || input_finished {
307 break;
308 }
309
310 if buffered.is_empty() {
311 continue;
312 }
313
314 return Poll::Ready(Some(Err(ArrowError::ParseError(
315 "CSV decoder made no progress while input bytes remain".to_string(),
316 ))));
317 }
318
319 match decoder.flush() {
320 Ok(Some(batch)) => return Poll::Ready(Some(Ok(batch))),
321 Ok(None) if input_finished => return Poll::Ready(None),
322 Ok(None) => continue,
323 Err(error) if is_skippable_arrow_error(&error) => {
324 warn!(
325 "Skipping bad CSV record while copying from {}: {}",
326 path, error
327 );
328 }
329 Err(error) => return Poll::Ready(Some(Err(error))),
330 }
331 }
332 })
333 .map(|result: std::result::Result<RecordBatch, ArrowError>| result.map_err(Into::into));
334
335 Ok(Box::pin(RecordBatchStreamAdapter::new(
336 projected_schema,
337 stream,
338 )))
339}
340
341pub fn is_skippable_arrow_error(error: &ArrowError) -> bool {
342 matches!(
343 error,
344 ArrowError::ParseError(_)
345 | ArrowError::CastError(_)
346 | ArrowError::ComputeError(_)
347 | ArrowError::InvalidArgumentError(_)
348 )
349}
350
351#[cfg(test)]
352mod tests {
353 use std::sync::Arc;
354
355 use arrow_schema::{DataType, Field};
356 use common_recordbatch::adapter::DfRecordBatchStreamAdapter;
357 use common_recordbatch::{RecordBatch, RecordBatches};
358 use common_test_util::find_workspace_path;
359 use datafusion::datasource::physical_plan::{CsvSource, FileSource};
360 use datatypes::prelude::ConcreteDataType;
361 use datatypes::schema::{ColumnSchema, Schema};
362 use datatypes::vectors::{Float64Vector, StringVector, UInt32Vector, VectorRef};
363 use futures::TryStreamExt;
364
365 use super::*;
366 use crate::file_format::{
367 FORMAT_COMPRESSION_TYPE, FORMAT_DELIMITER, FORMAT_HAS_HEADER, FORMAT_HEADERS,
368 FORMAT_SCHEMA_INFER_MAX_RECORD, FORMAT_SKIP_BAD_RECORDS, FileFormat, file_to_stream,
369 };
370 use crate::test_util::{format_schema, test_store};
371
372 fn test_data_root() -> String {
373 find_workspace_path("/src/common/datasource/tests/csv")
374 .display()
375 .to_string()
376 }
377
378 #[tokio::test]
379 async fn infer_schema_basic() {
380 let csv = CsvFormat::default();
381 let store = test_store(&test_data_root());
382 let schema = csv.infer_schema(&store, "simple.csv").await.unwrap();
383 let formatted: Vec<_> = format_schema(schema);
384
385 assert_eq!(
386 vec![
387 "c1: Utf8: NULL",
388 "c2: Int64: NULL",
389 "c3: Int64: NULL",
390 "c4: Int64: NULL",
391 "c5: Int64: NULL",
392 "c6: Int64: NULL",
393 "c7: Int64: NULL",
394 "c8: Int64: NULL",
395 "c9: Int64: NULL",
396 "c10: Utf8: NULL",
397 "c11: Float64: NULL",
398 "c12: Float64: NULL",
399 "c13: Utf8: NULL"
400 ],
401 formatted,
402 );
403 }
404
405 #[tokio::test]
406 async fn normalize_infer_schema() {
407 let csv = CsvFormat {
408 schema_infer_max_record: Some(3),
409 ..CsvFormat::default()
410 };
411 let store = test_store(&test_data_root());
412 let schema = csv.infer_schema(&store, "max_infer.csv").await.unwrap();
413 let formatted: Vec<_> = format_schema(schema);
414
415 assert_eq!(
416 vec![
417 "num: Int64: NULL",
418 "str: Utf8: NULL",
419 "ts: Utf8: NULL",
420 "t: Utf8: NULL",
421 "date: Date32: NULL"
422 ],
423 formatted,
424 );
425 }
426
427 #[tokio::test]
428 async fn infer_schema_with_limit() {
429 let csv = CsvFormat {
430 schema_infer_max_record: Some(3),
431 ..CsvFormat::default()
432 };
433 let store = test_store(&test_data_root());
434 let schema = csv
435 .infer_schema(&store, "schema_infer_limit.csv")
436 .await
437 .unwrap();
438 let formatted: Vec<_> = format_schema(schema);
439
440 assert_eq!(
441 vec![
442 "a: Int64: NULL",
443 "b: Float64: NULL",
444 "c: Int64: NULL",
445 "d: Int64: NULL"
446 ],
447 formatted
448 );
449
450 let csv = CsvFormat::default();
451 let store = test_store(&test_data_root());
452 let schema = csv
453 .infer_schema(&store, "schema_infer_limit.csv")
454 .await
455 .unwrap();
456 let formatted: Vec<_> = format_schema(schema);
457
458 assert_eq!(
459 vec![
460 "a: Int64: NULL",
461 "b: Float64: NULL",
462 "c: Int64: NULL",
463 "d: Utf8: NULL"
464 ],
465 formatted
466 );
467 }
468
469 #[test]
470 fn test_try_from() {
471 let map = HashMap::new();
472 let format: CsvFormat = CsvFormat::try_from(&map).unwrap();
473
474 assert_eq!(format, CsvFormat::default());
475
476 let map = HashMap::from([
477 (
478 FORMAT_SCHEMA_INFER_MAX_RECORD.to_string(),
479 "2000".to_string(),
480 ),
481 (FORMAT_COMPRESSION_TYPE.to_string(), "zstd".to_string()),
482 (FORMAT_DELIMITER.to_string(), b'\t'.to_string()),
483 (FORMAT_HAS_HEADER.to_string(), "false".to_string()),
484 ]);
485 let format = CsvFormat::try_from(&map).unwrap();
486
487 assert_eq!(
488 format,
489 CsvFormat {
490 compression_type: CompressionType::Zstd,
491 schema_infer_max_record: Some(2000),
492 delimiter: b'\t',
493 has_header: false,
494 skip_bad_records: false,
495 timestamp_format: None,
496 time_format: None,
497 date_format: None
498 }
499 );
500
501 let map = HashMap::from([(FORMAT_SKIP_BAD_RECORDS.to_string(), "true".to_string())]);
502 let format = CsvFormat::try_from(&map).unwrap();
503
504 assert_eq!(
505 format,
506 CsvFormat {
507 skip_bad_records: true,
508 ..CsvFormat::default()
509 }
510 );
511
512 let map = HashMap::from([(FORMAT_HEADERS.to_string(), "true".to_string())]);
513 let format = CsvFormat::try_from(&map).unwrap();
514 assert_eq!(format, CsvFormat::default());
515
516 let map = HashMap::from([(FORMAT_HEADERS.to_string(), "false".to_string())]);
517 let format = CsvFormat::try_from(&map).unwrap();
518 assert_eq!(
519 format,
520 CsvFormat {
521 has_header: false,
522 ..CsvFormat::default()
523 }
524 );
525
526 let map = HashMap::from([
527 (FORMAT_HEADERS.to_string(), "false".to_string()),
528 (FORMAT_HAS_HEADER.to_string(), "false".to_string()),
529 ]);
530 let format = CsvFormat::try_from(&map).unwrap();
531 assert_eq!(
532 format,
533 CsvFormat {
534 has_header: false,
535 ..CsvFormat::default()
536 }
537 );
538 }
539
540 #[test]
541 fn test_try_from_rejects_invalid_bool_options() {
542 let map = HashMap::from([(FORMAT_SKIP_BAD_RECORDS.to_string(), "yes".to_string())]);
543 assert!(CsvFormat::try_from(&map).is_err());
544
545 let map = HashMap::from([(FORMAT_HEADERS.to_string(), "yes".to_string())]);
546 assert!(CsvFormat::try_from(&map).is_err());
547 }
548
549 #[test]
550 fn test_try_from_rejects_conflicting_header_options() {
551 let map = HashMap::from([
552 (FORMAT_HEADERS.to_string(), "false".to_string()),
553 (FORMAT_HAS_HEADER.to_string(), "true".to_string()),
554 ]);
555 assert!(CsvFormat::try_from(&map).is_err());
556 }
557
558 #[tokio::test]
559 async fn test_compressed_csv() {
560 let column_schemas = vec![
562 ColumnSchema::new("id", ConcreteDataType::uint32_datatype(), false),
563 ColumnSchema::new("name", ConcreteDataType::string_datatype(), false),
564 ColumnSchema::new("value", ConcreteDataType::float64_datatype(), false),
565 ];
566 let schema = Arc::new(Schema::new(column_schemas));
567
568 let batch1_columns: Vec<VectorRef> = vec![
570 Arc::new(UInt32Vector::from_slice(vec![1, 2, 3])),
571 Arc::new(StringVector::from(vec!["Alice", "Bob", "Charlie"])),
572 Arc::new(Float64Vector::from_slice(vec![10.5, 20.3, 30.7])),
573 ];
574 let batch1 = RecordBatch::new(schema.clone(), batch1_columns).unwrap();
575
576 let batch2_columns: Vec<VectorRef> = vec![
577 Arc::new(UInt32Vector::from_slice(vec![4, 5, 6])),
578 Arc::new(StringVector::from(vec!["David", "Eva", "Frank"])),
579 Arc::new(Float64Vector::from_slice(vec![40.1, 50.2, 60.3])),
580 ];
581 let batch2 = RecordBatch::new(schema.clone(), batch2_columns).unwrap();
582
583 let batch3_columns: Vec<VectorRef> = vec![
584 Arc::new(UInt32Vector::from_slice(vec![7, 8, 9])),
585 Arc::new(StringVector::from(vec!["Grace", "Henry", "Ivy"])),
586 Arc::new(Float64Vector::from_slice(vec![70.4, 80.5, 90.6])),
587 ];
588 let batch3 = RecordBatch::new(schema.clone(), batch3_columns).unwrap();
589
590 let recordbatches = RecordBatches::try_new(schema, vec![batch1, batch2, batch3]).unwrap();
592
593 let compression_types = vec![
595 CompressionType::Gzip,
596 CompressionType::Bzip2,
597 CompressionType::Xz,
598 CompressionType::Zstd,
599 ];
600
601 let temp_dir = common_test_util::temp_dir::create_temp_dir("test_compressed_csv");
603 for compression_type in compression_types {
604 let format = CsvFormat {
605 compression_type,
606 ..CsvFormat::default()
607 };
608
609 let compressed_file_name =
611 format!("test_compressed_csv.{}", compression_type.file_extension());
612 let compressed_file_path = temp_dir.path().join(&compressed_file_name);
613 let compressed_file_path_str = compressed_file_path.to_str().unwrap();
614
615 let store = test_store("/");
617
618 let rows = stream_to_csv(
620 Box::pin(DfRecordBatchStreamAdapter::new(recordbatches.as_stream())),
621 store,
622 compressed_file_path_str,
623 1024,
624 1,
625 &format,
626 )
627 .await
628 .unwrap();
629
630 assert_eq!(rows, 9);
631
632 assert!(compressed_file_path.exists());
634 let file_size = std::fs::metadata(&compressed_file_path).unwrap().len();
635 assert!(file_size > 0);
636
637 let file_content = std::fs::read(&compressed_file_path).unwrap();
639 match compression_type {
642 CompressionType::Gzip => {
643 assert_eq!(file_content[0], 0x1f, "Gzip file should start with 0x1f");
645 assert_eq!(
646 file_content[1], 0x8b,
647 "Gzip file should have 0x8b as second byte"
648 );
649 }
650 CompressionType::Bzip2 => {
651 assert_eq!(file_content[0], b'B', "Bzip2 file should start with 'B'");
653 assert_eq!(
654 file_content[1], b'Z',
655 "Bzip2 file should have 'Z' as second byte"
656 );
657 }
658 CompressionType::Xz => {
659 assert_eq!(file_content[0], 0xFD, "XZ file should start with 0xFD");
661 }
662 CompressionType::Zstd => {
663 assert_eq!(file_content[0], 0x28, "Zstd file should start with 0x28");
665 assert_eq!(
666 file_content[1], 0xB5,
667 "Zstd file should have 0xB5 as second byte"
668 );
669 }
670 _ => {}
671 }
672
673 let store = test_store("/");
675 let schema = Arc::new(
676 CsvFormat {
677 compression_type,
678 ..Default::default()
679 }
680 .infer_schema(&store, compressed_file_path_str)
681 .await
682 .unwrap(),
683 );
684 let csv_source = CsvSource::new(schema).with_batch_size(8192);
685
686 let stream = file_to_stream(
687 &store,
688 compressed_file_path_str,
689 csv_source.clone(),
690 None,
691 compression_type,
692 )
693 .await
694 .unwrap();
695
696 let batches = stream.try_collect::<Vec<_>>().await.unwrap();
697 let pretty_print = arrow::util::pretty::pretty_format_batches(&batches)
698 .unwrap()
699 .to_string();
700 let expected = r#"+----+---------+-------+
701| id | name | value |
702+----+---------+-------+
703| 1 | Alice | 10.5 |
704| 2 | Bob | 20.3 |
705| 3 | Charlie | 30.7 |
706| 4 | David | 40.1 |
707| 5 | Eva | 50.2 |
708| 6 | Frank | 60.3 |
709| 7 | Grace | 70.4 |
710| 8 | Henry | 80.5 |
711| 9 | Ivy | 90.6 |
712+----+---------+-------+"#;
713 assert_eq!(expected, pretty_print);
714 }
715 }
716
717 #[tokio::test]
718 async fn test_tolerant_csv_stream_continues_after_parse_error() {
719 let temp_dir = common_test_util::temp_dir::create_temp_dir("test_tolerant_csv_stream");
720 let csv_file_path = temp_dir.path().join("input.csv");
721 std::fs::write(
722 &csv_file_path,
723 "id,name,value\n1,Alice,10.5\nbad,Bad,20.0\nworse,Bad,21.0\n2,Bob,30.5",
724 )
725 .unwrap();
726
727 let store = test_store("/");
728 let schema = Arc::new(arrow_schema::Schema::new(vec![
729 Field::new("id", DataType::UInt32, false),
730 Field::new("name", DataType::Utf8, false),
731 Field::new("value", DataType::Float64, false),
732 ]));
733 let path = csv_file_path.to_str().unwrap();
734
735 let stream =
736 tolerant_csv_stream(&store, path, schema, vec![0, 1, 2], &CsvFormat::default())
737 .await
738 .unwrap();
739 let batches = stream.try_collect::<Vec<_>>().await.unwrap();
740 let pretty_print = arrow::util::pretty::pretty_format_batches(&batches)
741 .unwrap()
742 .to_string();
743 let expected = r#"+----+-------+-------+
744| id | name | value |
745+----+-------+-------+
746| 1 | Alice | 10.5 |
747| 2 | Bob | 30.5 |
748+----+-------+-------+"#;
749 assert_eq!(expected, pretty_print);
750 }
751
752 #[tokio::test]
753 async fn test_tolerant_csv_stream_fails_on_structural_csv_error() {
754 let temp_dir =
755 common_test_util::temp_dir::create_temp_dir("test_tolerant_csv_stream_csv_error");
756 let csv_file_path = temp_dir.path().join("input.csv");
757 std::fs::write(&csv_file_path, "id,name,value\n1,Alice,10.5\n2,Bob\n").unwrap();
758
759 let store = test_store("/");
760 let schema = Arc::new(arrow_schema::Schema::new(vec![
761 Field::new("id", DataType::UInt32, false),
762 Field::new("name", DataType::Utf8, false),
763 Field::new("value", DataType::Float64, false),
764 ]));
765 let path = csv_file_path.to_str().unwrap();
766
767 let stream =
768 tolerant_csv_stream(&store, path, schema, vec![0, 1, 2], &CsvFormat::default())
769 .await
770 .unwrap();
771 let error = stream.try_collect::<Vec<_>>().await.unwrap_err();
772
773 assert!(error.to_string().contains("incorrect number of fields"));
774 }
775}