source: main/trunk/greenstone2/perllib/cpan/Text/CSV.pm@ 33235

Last change on this file since 33235 was 33235, checked in by davidb, 5 years ago

CPAN module for processing CSV files

File size: 81.9 KB
Line 
package Text::CSV;


use strict;
use Exporter;
use Carp ();

# Package globals.  $VERSION and $DEBUG are assigned in a BEGIN block, i.e. at
# compile time, before any of the run-time statements in this file execute.
use vars qw($VERSION $DEBUG @ISA @EXPORT_OK);

BEGIN {
    $VERSION = '1.99';
    $DEBUG   = 0;
}

@ISA       = ('Exporter');
@EXPORT_OK = ('csv');

# The two candidate backends, and the minimum Text::CSV_XS version asked for
# when the XS backend is loaded.
my $Module_XS  = 'Text::CSV_XS';
my $Module_PP  = 'Text::CSV_PP';
my $XS_Version = '1.02';

# Value reported by is_dynamic().
my $Is_Dynamic = 0;

# Backend functions that _load() aliases into the Text::CSV namespace.
my @PublicMethods = qw(
    version error_diag error_input
    known_attributes csv
    PV IV NV
);
28#
29
# Choose the backend ("worker") module, unless one has already been chosen.
# The PERL_TEXT_CSV environment variable, when present, decides:
#   0 or Text::CSV_PP              - pure-Perl backend only
#   1 or Text::CSV_XS,Text::CSV_PP - XS backend, falling back on pure Perl
#                                    (the pattern is matched anywhere in the
#                                    value; whitespace around the comma is ok)
#   2 or Text::CSV_XS              - XS backend only
# Any other value is fatal.  Without the variable, XS is tried first and
# pure Perl second.  A failed load croaks with the error left in $@.

unless ($Text::CSV::Worker) {
    Carp::carp("Check used worker module...") if $Text::CSV::DEBUG;

    if ( !exists $ENV{PERL_TEXT_CSV} ) {
        _load_xs() or _load_pp() or Carp::croak($@);
    }
    else {
        my $wanted = $ENV{PERL_TEXT_CSV};

        if ( $wanted eq '0' or $wanted eq 'Text::CSV_PP' ) {
            _load_pp() or Carp::croak($@);
        }
        elsif ( $wanted eq '1' or $wanted =~ /Text::CSV_XS\s*,\s*Text::CSV_PP/ ) {
            _load_xs() or _load_pp() or Carp::croak($@);
        }
        elsif ( $wanted eq '2' or $wanted eq 'Text::CSV_XS' ) {
            _load_xs() or Carp::croak($@);
        }
        else {
            Carp::croak("The value of environmental variable 'PERL_TEXT_CSV' is invalid.");
        }
    }
}
54
# Constructor (normal mode).
#
# Builds the object with the selected backend ($Text::CSV::Worker), records
# the backend name in the object's _MODULE slot and reblesses the object into
# the invocant's class.
#
# Returns the object, or nothing when the backend constructor returns false.
sub new {
    my $proto = shift;

    # A false invocant is the Text::CSV_XS::new(0) / Text::CSV_PP::new(0)
    # style of call: hand it to the backend's new() as a plain function call.
    # The eval'ed code refers to the lexical $proto by name.
    if ( !$proto ) {
        return eval qq| $Text::CSV::Worker\::new( \$proto ) |;
    }

    my $class = ref($proto) || $proto;

    #if (ref $_[0] and $_[0]->{module}) {
    #    Carp::croak("Can't set 'module' in non dynamic mode.");
    #}

    my $obj = $Text::CSV::Worker->new(@_)
        or return;

    $obj->{_MODULE} = $Text::CSV::Worker;
    return bless $obj, $class;
}
78
79
# Returns the Text::CSV_XS version requested when the XS backend is loaded.
sub require_xs_version {
    return $XS_Version;
}
81
82
# Name of the backend in use.
#
# Called on the class (any non-reference invocant) it returns the backend
# selected for the whole module.  Called on an object it looks at the object's
# _MODULE slot: when that slot holds a reference its ref() is returned,
# otherwise the stored value itself.
sub module {
    my ($self) = @_;

    return $Text::CSV::Worker unless ref $self;

    my $stored = $self->{_MODULE};
    return ref($stored) || $stored;
}

# backend() is another name for module().
*backend = *module;
90
91
# True when module() reports the XS backend for this class or object.
sub is_xs {
    my ($self) = @_;
    return $self->module eq $Module_XS;
}
95
96
# True when module() reports the pure-Perl backend for this class or object.
sub is_pp {
    my ($self) = @_;
    return $self->module eq $Module_PP;
}
100
101
# Returns the file-level $Is_Dynamic flag (initialised to 0 in this file).
sub is_dynamic {
    return $Is_Dynamic;
}
103
# Try to load the XS backend, asking for at least $XS_Version of it.
# Returns whatever _load() returns.
sub _load_xs {
    return _load($Module_XS, $XS_Version);
}
105
# Try to load the pure-Perl backend (no version requirement).
# Returns whatever _load() returns.
sub _load_pp {
    return _load($Module_PP);
}
107
# Load one backend and make it the active worker.
#
#   $module  - name of the backend package
#   $version - optional minimum version handed to "use"
#
# On success the backend is appended to @Text::CSV::ISA, recorded in
# $Text::CSV::Worker, and each function named in @PublicMethods is aliased
# into the Text::CSV namespace; the sub then returns 1.
# On failure it returns nothing and the reason is left in $@.
sub _load {
    my ($module, $version) = @_;
    $version = '' unless $version;

    Carp::carp("Load $module.") if $Text::CSV::DEBUG;

    # String eval: "use" needs the module name (and version) at compile time.
    eval qq| use $module $version |;
    return if $@;

    $Text::CSV::Worker = $module;
    push @Text::CSV::ISA, $module;

    # Switch off $^W for the rest of this sub while the globs are assigned,
    # and allow symbolic references for the assignments themselves.
    local $^W;
    no strict qw(refs);

    foreach my $method (@PublicMethods) {
        my $source = \&{"$module\::$method"};
        *{"Text::CSV::$method"} = $source;
    }

    return 1;
}
129
130
131
1321;
133__END__
134
135=pod
136
137=head1 NAME
138
139Text::CSV - comma-separated values manipulator (using XS or PurePerl)
140
141
142=head1 SYNOPSIS
143
144This section is taken from Text::CSV_XS.
145
146 # Functional interface
147 use Text::CSV qw( csv );
148
149 # Read whole file in memory
150 my $aoa = csv (in => "data.csv"); # as array of array
151 my $aoh = csv (in => "data.csv",
152 headers => "auto"); # as array of hash
153
154 # Write array of arrays as csv file
155 csv (in => $aoa, out => "file.csv", sep_char=> ";");
156
157 # Only show lines where "code" is odd
158 csv (in => "data.csv", filter => { code => sub { $_ % 2 }});
159
160 # Object interface
161 use Text::CSV;
162
163 my @rows;
164 # Read/parse CSV
165 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
166 open my $fh, "<:encoding(utf8)", "test.csv" or die "test.csv: $!";
167 while (my $row = $csv->getline ($fh)) {
168 $row->[2] =~ m/pattern/ or next; # 3rd field should match
169 push @rows, $row;
170 }
171 close $fh;
172
173 # and write as CSV
174 open $fh, ">:encoding(utf8)", "new.csv" or die "new.csv: $!";
175 $csv->say ($fh, $_) for @rows;
176 close $fh or die "new.csv: $!";
177
178=head1 DESCRIPTION
179
180Text::CSV is a thin wrapper for L<Text::CSV_XS>-compatible modules now.
181All the backend modules provide facilities for the composition and
182decomposition of comma-separated values. Text::CSV uses Text::CSV_XS
183by default, and when Text::CSV_XS is not available, falls back on
184L<Text::CSV_PP>, which is bundled in the same distribution as this module.
185
186=head1 CHOOSING BACKEND
187
188This module respects an environmental variable called C<PERL_TEXT_CSV>
189when it decides a backend module to use. If this environmental variable
190is not set, it tries to load Text::CSV_XS, and if Text::CSV_XS is not
191available, falls back on Text::CSV_PP.
192
193If you never want it to fall back on Text::CSV_PP, set the variable
194like this (C<export> may be C<setenv>, C<set> and the likes, depending
195on your environment):
196
197 > export PERL_TEXT_CSV=Text::CSV_XS
198
199If you prefer Text::CSV_XS to Text::CSV_PP (default), then:
200
201 > export PERL_TEXT_CSV=Text::CSV_XS,Text::CSV_PP
202
203You may also want to set this variable at the top of your test files, in order
204not to be bothered with incompatibilities between backends (you need to wrap
205this in C<BEGIN>, and set before actually C<use>-ing Text::CSV module, as it
206decides its backend as soon as it's loaded):
207
208 BEGIN { $ENV{PERL_TEXT_CSV}='Text::CSV_PP'; }
209 use Text::CSV;
210
211=head1 NOTES
212
213This section is also taken from Text::CSV_XS.
214
215=head2 Embedded newlines
216
217B<Important Note>: The default behavior is to accept only ASCII characters
218in the range from C<0x20> (space) to C<0x7E> (tilde). This means that the
219fields can not contain newlines. If your data contains newlines embedded in
220fields, or characters above C<0x7E> (tilde), or binary data, you B<I<must>>
221set C<< binary => 1 >> in the call to L</new>. To cover the widest range of
222parsing options, you will always want to set binary.
223
224But you still have the problem that you have to pass a correct line to the
225L</parse> method, which is more complicated from the usual point of usage:
226
227 my $csv = Text::CSV->new ({ binary => 1, eol => $/ });
228 while (<>) { # WRONG!
229 $csv->parse ($_);
230 my @fields = $csv->fields ();
231 }
232
233this will break, as the C<while> might read broken lines: it does not care
234about the quoting. If you need to support embedded newlines, the way to go
235is to B<not> pass L<C<eol>|/eol> in the parser (it accepts C<\n>, C<\r>,
236B<and> C<\r\n> by default) and then
237
238 my $csv = Text::CSV->new ({ binary => 1 });
239 open my $fh, "<", $file or die "$file: $!";
240 while (my $row = $csv->getline ($fh)) {
241 my @fields = @$row;
242 }
243
244The old(er) way of using global file handles is still supported
245
246 while (my $row = $csv->getline (*ARGV)) { ... }
247
248=head2 Unicode
249
250Unicode is only tested to work with perl-5.8.2 and up.
251
252See also L</BOM>.
253
254The simplest way to ensure the correct encoding is used for in- and output
255is by either setting layers on the filehandles, or setting the L</encoding>
256argument for L</csv>.
257
258 open my $fh, "<:encoding(UTF-8)", "in.csv" or die "in.csv: $!";
259or
260 my $aoa = csv (in => "in.csv", encoding => "UTF-8");
261
262 open my $fh, ">:encoding(UTF-8)", "out.csv" or die "out.csv: $!";
263or
264 csv (in => $aoa, out => "out.csv", encoding => "UTF-8");
265
266On parsing (both for L</getline> and L</parse>), if the source is marked
267being UTF8, then all fields that are marked binary will also be marked UTF8.
268
269On combining (L</print> and L</combine>): if any of the combining fields
270was marked UTF8, the resulting string will be marked as UTF8. Note however
271that all fields I<before> the first field marked UTF8 and contained 8-bit
272characters that were not upgraded to UTF8, these will be C<bytes> in the
273resulting string too, possibly causing unexpected errors. If you pass data
274of different encoding, or you don't know if there is different encoding,
275force it to be upgraded before you pass them on:
276
277 $csv->print ($fh, [ map { utf8::upgrade (my $x = $_); $x } @data ]);
278
279For complete control over encoding, please use L<Text::CSV::Encoded>:
280
281 use Text::CSV::Encoded;
282 my $csv = Text::CSV::Encoded->new ({
283 encoding_in => "iso-8859-1", # the encoding comes into Perl
284 encoding_out => "cp1252", # the encoding comes out of Perl
285 });
286
287 $csv = Text::CSV::Encoded->new ({ encoding => "utf8" });
288 # combine () and print () accept *literally* utf8 encoded data
289 # parse () and getline () return *literally* utf8 encoded data
290
291 $csv = Text::CSV::Encoded->new ({ encoding => undef }); # default
292 # combine () and print () accept UTF8 marked data
293 # parse () and getline () return UTF8 marked data
294
295=head2 BOM
296
297BOM (or Byte Order Mark) handling is available only inside the L</header>
298method. This method supports the following encodings: C<utf-8>, C<utf-1>,
299C<utf-32be>, C<utf-32le>, C<utf-16be>, C<utf-16le>, C<utf-ebcdic>, C<scsu>,
300C<bocu-1>, and C<gb-18030>. See L<Wikipedia|https://en.wikipedia.org/wiki/Byte_order_mark>.
301
302If a file has a BOM, the easiest way to deal with that is
303
304 my $aoh = csv (in => $file, detect_bom => 1);
305
306All records will be encoded based on the detected BOM.
307
308This implies a call to the L</header> method, which defaults to also set
309the L</column_names>. So this is B<not> the same as
310
311 my $aoh = csv (in => $file, headers => "auto");
312
313which only reads the first record to set L</column_names> but ignores any
314meaning of possible present BOM.
315
316=head1 METHODS
317
318This section is also taken from Text::CSV_XS.
319
320=head2 version
321
322(Class method) Returns the current module version.
323
324=head2 new
325
326(Class method) Returns a new instance of class Text::CSV. The attributes
327are described by the (optional) hash ref C<\%attr>.
328
329 my $csv = Text::CSV->new ({ attributes ... });
330
331The following attributes are available:
332
333=head3 eol
334
335 my $csv = Text::CSV->new ({ eol => $/ });
336 $csv->eol (undef);
337 my $eol = $csv->eol;
338
339The end-of-line string to add to rows for L</print> or the record separator
340for L</getline>.
341
342When not passed in a B<parser> instance, the default behavior is to accept
343C<\n>, C<\r>, and C<\r\n>, so it is probably safer to not specify C<eol> at
344all. Passing C<undef> or the empty string behave the same.
345
346When not passed in a B<generating> instance, records are not terminated at
347all, so it is probably wise to pass something you expect. A safe choice for
348C<eol> on output is either C<$/> or C<\r\n>.
349
350Common values for C<eol> are C<"\012"> (C<\n> or Line Feed), C<"\015\012">
351(C<\r\n> or Carriage Return, Line Feed), and C<"\015"> (C<\r> or Carriage
352Return). The L<C<eol>|/eol> attribute cannot exceed 7 (ASCII) characters.
353
354If both C<$/> and L<C<eol>|/eol> equal C<"\015">, parsing lines that end on
355only a Carriage Return without Line Feed, will be L</parse>d correctly.
356
357=head3 sep_char
358
359 my $csv = Text::CSV->new ({ sep_char => ";" });
360 $csv->sep_char (";");
361 my $c = $csv->sep_char;
362
363The char used to separate fields, by default a comma. (C<,>). Limited to a
364single-byte character, usually in the range from C<0x20> (space) to C<0x7E>
365(tilde). When longer sequences are required, use L<C<sep>|/sep>.
366
367The separation character can not be equal to the quote character or to the
368escape character.
369
370=head3 sep
371
372 my $csv = Text::CSV->new ({ sep => "\N{FULLWIDTH COMMA}" });
373 $csv->sep (";");
374 my $sep = $csv->sep;
375
376The chars used to separate fields, by default undefined. Limited to 8 bytes.
377
378When set, overrules L<C<sep_char>|/sep_char>. If its length is one byte it
379acts as an alias to L<C<sep_char>|/sep_char>.
380
381=head3 quote_char
382
383 my $csv = Text::CSV->new ({ quote_char => "'" });
384 $csv->quote_char (undef);
385 my $c = $csv->quote_char;
386
387The character to quote fields containing blanks or binary data, by default
388the double quote character (C<">). A value of undef suppresses quote chars
389(for simple cases only). Limited to a single-byte character, usually in the
390range from C<0x20> (space) to C<0x7E> (tilde). When longer sequences are
391required, use L<C<quote>|/quote>.
392
393C<quote_char> can not be equal to L<C<sep_char>|/sep_char>.
394
395=head3 quote
396
397 my $csv = Text::CSV->new ({ quote => "\N{FULLWIDTH QUOTATION MARK}" });
398 $csv->quote ("'");
399 my $quote = $csv->quote;
400
401The chars used to quote fields, by default undefined. Limited to 8 bytes.
402
403When set, overrules L<C<quote_char>|/quote_char>. If its length is one byte
404it acts as an alias to L<C<quote_char>|/quote_char>.
405
406=head3 escape_char
407
408 my $csv = Text::CSV->new ({ escape_char => "\\" });
409 $csv->escape_char (":");
410 my $c = $csv->escape_char;
411
412The character to escape certain characters inside quoted fields. This is
413limited to a single-byte character, usually in the range from C<0x20>
414(space) to C<0x7E> (tilde).
415
416The C<escape_char> defaults to being the double-quote mark (C<">). In other
417words the same as the default L<C<quote_char>|/quote_char>. This means that
418doubling the quote mark in a field escapes it:
419
420 "foo","bar","Escape ""quote mark"" with two ""quote marks""","baz"
421
422If you change the L<C<quote_char>|/quote_char> without changing the
423C<escape_char>, the C<escape_char> will still be the double-quote (C<">).
424If instead you want to escape the L<C<quote_char>|/quote_char> by doubling
425it you will need to also change the C<escape_char> to be the same as what
426you have changed the L<C<quote_char>|/quote_char> to.
427
428Setting C<escape_char> to C<undef> or C<""> will disable escaping completely
429and is greatly discouraged. This will also disable C<escape_null>.
430
431The escape character can not be equal to the separation character.
432
433=head3 binary
434
435 my $csv = Text::CSV->new ({ binary => 1 });
436 $csv->binary (0);
437 my $f = $csv->binary;
438
439If this attribute is C<1>, you may use binary characters in quoted fields,
440including line feeds, carriage returns and C<NULL> bytes. (The latter could
441be escaped as C<"0>.) By default this feature is off.
442
443If a string is marked UTF8, C<binary> will be turned on automatically when
444binary characters other than C<CR> and C<NL> are encountered. Note that a
445simple string like C<"\x{00a0}"> might still be binary, but not marked UTF8,
446so setting C<< { binary => 1 } >> is still a wise option.
447
448=head3 strict
449
450 my $csv = Text::CSV->new ({ strict => 1 });
451 $csv->strict (0);
452 my $f = $csv->strict;
453
454If this attribute is set to C<1>, any row that parses to a different number
455of fields than the previous row will cause the parser to throw error 2014.
456
457=head3 formula_handling
458
459=head3 formula
460
461 my $csv = Text::CSV->new ({ formula => "none" });
462 $csv->formula ("none");
463 my $f = $csv->formula;
464
465This defines the behavior of fields containing I<formulas>. As formulas are
466considered dangerous in spreadsheets, this attribute can define an optional
467action to be taken if a field starts with an equal sign (C<=>).
468
469For purpose of code-readability, this can also be written as
470
471 my $csv = Text::CSV->new ({ formula_handling => "none" });
472 $csv->formula_handling ("none");
473 my $f = $csv->formula_handling;
474
475Possible values for this attribute are
476
477=over 2
478
479=item none
480
481Take no specific action. This is the default.
482
483 $csv->formula ("none");
484
485=item die
486
487Cause the process to C<die> whenever a leading C<=> is encountered.
488
489 $csv->formula ("die");
490
491=item croak
492
493Cause the process to C<croak> whenever a leading C<=> is encountered. (See
494L<Carp>)
495
496 $csv->formula ("croak");
497
498=item diag
499
500Report position and content of the field whenever a leading C<=> is found.
501The value of the field is unchanged.
502
503 $csv->formula ("diag");
504
505=item empty
506
507Replace the content of fields that start with a C<=> with the empty string.
508
509 $csv->formula ("empty");
510 $csv->formula ("");
511
512=item undef
513
514Replace the content of fields that start with a C<=> with C<undef>.
515
516 $csv->formula ("undef");
517 $csv->formula (undef);
518
519=back
520
521All other values will give a warning and then fallback to C<diag>.
522
523=head3 decode_utf8
524
525 my $csv = Text::CSV->new ({ decode_utf8 => 1 });
526 $csv->decode_utf8 (0);
527 my $f = $csv->decode_utf8;
528
529This attribute defaults to TRUE.
530
531While I<parsing>, fields that are valid UTF-8, are automatically set to be
532UTF-8, so that
533
534 $csv->parse ("\xC4\xA8\n");
535
536results in
537
538 PV("\304\250"\0) [UTF8 "\x{128}"]
539
540Sometimes it might not be a desired action. To prevent those upgrades, set
541this attribute to false, and the result will be
542
543 PV("\304\250"\0)
544
545=head3 auto_diag
546
547 my $csv = Text::CSV->new ({ auto_diag => 1 });
548 $csv->auto_diag (2);
549 my $l = $csv->auto_diag;
550
551Setting this attribute to a number between C<1> and C<9> causes L</error_diag>
552to be automatically called in void context upon errors.
553
554In case of error C<2012 - EOF>, this call will be void.
555
556If C<auto_diag> is set to a numeric value greater than C<1>, it will C<die>
557on errors instead of C<warn>. If set to anything unrecognized, it will be
558silently ignored.
559
560Future extensions to this feature will include more reliable auto-detection
561of C<autodie> being active in the scope of which the error occurred which
562will increment the value of C<auto_diag> with C<1> the moment the error is
563detected.
564
565=head3 diag_verbose
566
567 my $csv = Text::CSV->new ({ diag_verbose => 1 });
568 $csv->diag_verbose (2);
569 my $l = $csv->diag_verbose;
570
571Set the verbosity of the output triggered by C<auto_diag>. Currently only
572adds the current input-record-number (if known) to the diagnostic output
573with an indication of the position of the error.
574
575=head3 blank_is_undef
576
577 my $csv = Text::CSV->new ({ blank_is_undef => 1 });
578 $csv->blank_is_undef (0);
579 my $f = $csv->blank_is_undef;
580
581Under normal circumstances, C<CSV> data makes no distinction between quoted-
582and unquoted empty fields. These both end up in an empty string field once
583read, thus
584
585 1,"",," ",2
586
587is read as
588
589 ("1", "", "", " ", "2")
590
591When I<writing> C<CSV> files with either L<C<always_quote>|/always_quote>
592or L<C<quote_empty>|/quote_empty> set, the unquoted I<empty> field is the
593result of an undefined value. To enable this distinction when I<reading>
594C<CSV> data, the C<blank_is_undef> attribute will cause unquoted empty
595fields to be set to C<undef>, causing the above to be parsed as
596
597 ("1", "", undef, " ", "2")
598
599note that this is specifically important when loading C<CSV> fields into a
600database that allows C<NULL> values, as the perl equivalent for C<NULL> is
601C<undef> in L<DBI> land.
602
603=head3 empty_is_undef
604
605 my $csv = Text::CSV->new ({ empty_is_undef => 1 });
606 $csv->empty_is_undef (0);
607 my $f = $csv->empty_is_undef;
608
609Going one step further than L<C<blank_is_undef>|/blank_is_undef>, this
610attribute converts all empty fields to C<undef>, so
611
612 1,"",," ",2
613
614is read as
615
616 (1, undef, undef, " ", 2)
617
618Note that this affects only fields that are originally empty, not fields
619that are empty after stripping allowed whitespace. YMMV.
620
621=head3 allow_whitespace
622
623 my $csv = Text::CSV->new ({ allow_whitespace => 1 });
624 $csv->allow_whitespace (0);
625 my $f = $csv->allow_whitespace;
626
627When this option is set to true, the whitespace (C<TAB>'s and C<SPACE>'s)
628surrounding the separation character is removed when parsing. If either
629C<TAB> or C<SPACE> is one of the three characters L<C<sep_char>|/sep_char>,
630L<C<quote_char>|/quote_char>, or L<C<escape_char>|/escape_char> it will not
631be considered whitespace.
632
633Now lines like:
634
635 1 , "foo" , bar , 3 , zapp
636
637are parsed as valid C<CSV>, even though it violates the C<CSV> specs.
638
639Note that B<all> whitespace is stripped from both start and end of each
640field. That would make it I<more> than a I<feature> to enable parsing bad
641C<CSV> lines, as
642
643 1, 2.0, 3, ape , monkey
644
645will now be parsed as
646
647 ("1", "2.0", "3", "ape", "monkey")
648
649even if the original line was perfectly acceptable C<CSV>.
650
651=head3 allow_loose_quotes
652
653 my $csv = Text::CSV->new ({ allow_loose_quotes => 1 });
654 $csv->allow_loose_quotes (0);
655 my $f = $csv->allow_loose_quotes;
656
657By default, parsing unquoted fields containing L<C<quote_char>|/quote_char>
658characters like
659
660 1,foo "bar" baz,42
661
662would result in parse error 2034. Though it is still bad practice to allow
663this format, we cannot help the fact that some vendors make their
664applications spit out lines styled this way.
665
666If there is B<really> bad C<CSV> data, like
667
668 1,"foo "bar" baz",42
669
670or
671
672 1,""foo bar baz"",42
673
674there is a way to get this data-line parsed and leave the quotes inside the
675quoted field as-is. This can be achieved by setting C<allow_loose_quotes>
676B<AND> making sure that the L<C<escape_char>|/escape_char> is I<not> equal
677to L<C<quote_char>|/quote_char>.
678
679=head3 allow_loose_escapes
680
681 my $csv = Text::CSV->new ({ allow_loose_escapes => 1 });
682 $csv->allow_loose_escapes (0);
683 my $f = $csv->allow_loose_escapes;
684
685Parsing fields that have L<C<escape_char>|/escape_char> characters that
686escape characters that do not need to be escaped, like:
687
688 my $csv = Text::CSV->new ({ escape_char => "\\" });
689 $csv->parse (qq{1,"my bar\'s",baz,42});
690
691would result in parse error 2025. Though it is bad practice to allow this
692format, this attribute enables you to treat all escape character sequences
693equal.
694
695=head3 allow_unquoted_escape
696
697 my $csv = Text::CSV->new ({ allow_unquoted_escape => 1 });
698 $csv->allow_unquoted_escape (0);
699 my $f = $csv->allow_unquoted_escape;
700
701A backward compatibility issue where L<C<escape_char>|/escape_char> differs
702from L<C<quote_char>|/quote_char> prevents L<C<escape_char>|/escape_char>
703to be in the first position of a field. If L<C<quote_char>|/quote_char> is
704equal to the default C<"> and L<C<escape_char>|/escape_char> is set to C<\>,
705this would be illegal:
706
707 1,\0,2
708
709Setting this attribute to C<1> might help to overcome issues with backward
710compatibility and allow this style.
711
712=head3 always_quote
713
714 my $csv = Text::CSV->new ({ always_quote => 1 });
715 $csv->always_quote (0);
716 my $f = $csv->always_quote;
717
718By default the generated fields are quoted only if they I<need> to be. For
719example, if they contain the separator character. If you set this attribute
720to C<1> then I<all> defined fields will be quoted. (C<undef> fields are not
721quoted, see L</blank_is_undef>). This makes it quite often easier to handle
722exported data in external applications.
723
724=head3 quote_space
725
726 my $csv = Text::CSV->new ({ quote_space => 1 });
727 $csv->quote_space (0);
728 my $f = $csv->quote_space;
729
730By default, a space in a field would trigger quotation. As no rule exists
731this to be forced in C<CSV>, nor any for the opposite, the default is true
732for safety. You can exclude the space from this trigger by setting this
733attribute to 0.
734
735=head3 quote_empty
736
737 my $csv = Text::CSV->new ({ quote_empty => 1 });
738 $csv->quote_empty (0);
739 my $f = $csv->quote_empty;
740
741By default the generated fields are quoted only if they I<need> to be. An
742empty (defined) field does not need quotation. If you set this attribute to
743C<1> then I<empty> defined fields will be quoted. (C<undef> fields are not
744quoted, see L</blank_is_undef>). See also L<C<always_quote>|/always_quote>.
745
746=head3 quote_binary
747
748 my $csv = Text::CSV->new ({ quote_binary => 1 });
749 $csv->quote_binary (0);
750 my $f = $csv->quote_binary;
751
752By default, all "unsafe" bytes inside a string cause the combined field to
753be quoted. By setting this attribute to C<0>, you can disable that trigger
754for bytes >= C<0x7F>.
755
756=head3 escape_null
757
758 my $csv = Text::CSV->new ({ escape_null => 1 });
759 $csv->escape_null (0);
760 my $f = $csv->escape_null;
761
762By default, a C<NULL> byte in a field would be escaped. This option enables
763you to treat the C<NULL> byte as a simple binary character in binary mode
764(the C<< { binary => 1 } >> is set). The default is true. You can prevent
765C<NULL> escapes by setting this attribute to C<0>.
766
767When the C<escape_char> attribute is set to undefined, this attribute will
768be set to false.
769
770The default setting will encode "=\x00=" as
771
772 "="0="
773
774With C<escape_null> set to C<0>, this will result in
775
776 "=\x00="
777
778The default when using the C<csv> function is C<false>.
779
780For backward compatibility reasons, the deprecated old name C<quote_null>
781is still recognized.
782
783=head3 keep_meta_info
784
785 my $csv = Text::CSV->new ({ keep_meta_info => 1 });
786 $csv->keep_meta_info (0);
787 my $f = $csv->keep_meta_info;
788
789By default, the parsing of input records is as simple and fast as possible.
790However, some parsing information - like quotation of the original field -
791is lost in that process. Setting this flag to true enables retrieving that
792information after parsing with the methods L</meta_info>, L</is_quoted>,
793and L</is_binary> described below. Default is false for performance.
794
795If you set this attribute to a value greater than 9, then you can control
796output quotation style like it was used in the input of the last parsed
797record (unless quotation was added because of other reasons).
798
799 my $csv = Text::CSV->new ({
800 binary => 1,
801 keep_meta_info => 1,
802 quote_space => 0,
803 });
804
805 my $row = $csv->parse (q{1,,"", ," ",f,"g","h""h",help,"help"});
806
807 $csv->print (*STDOUT, \@row);
808 # 1,,, , ,f,g,"h""h",help,help
809 $csv->keep_meta_info (11);
810 $csv->print (*STDOUT, \@row);
811 # 1,,"", ," ",f,"g","h""h",help,"help"
812
813=head3 undef_str
814
815 my $csv = Text::CSV->new ({ undef_str => "\\N" });
816 $csv->undef_str (undef);
817 my $s = $csv->undef_str;
818
819This attribute optionally defines the output of undefined fields. The value
820passed is not changed at all, so if it needs quotation, the quotation needs
821to be included in the value of the attribute. Use with caution, as passing
822a value like C<",",,,,"""> will for sure mess up your output. The default
823for this attribute is C<undef>, meaning no special treatment.
824
825This attribute is useful when exporting CSV data to be imported in custom
826loaders, like for MySQL, that recognize special sequences for C<NULL> data.
827
828This attribute has no meaning when parsing CSV data.
829
830=head3 verbatim
831
832 my $csv = Text::CSV->new ({ verbatim => 1 });
833 $csv->verbatim (0);
834 my $f = $csv->verbatim;
835
836This is a quite controversial attribute to set, but makes some hard things
837possible.
838
839The rationale behind this attribute is to tell the parser that the normally
840special characters newline (C<NL>) and Carriage Return (C<CR>) will not be
841special when this flag is set, and be dealt with as being ordinary binary
842characters. This will ease working with data with embedded newlines.
843
844When C<verbatim> is used with L</getline>, L</getline> auto-C<chomp>'s
845every line.
846
847Imagine a file format like
848
849 M^^Hans^Janssen^Klas 2\n2A^Ja^11-06-2007#\r\n
850
851where, the line ending is a very specific C<"#\r\n">, and the sep_char is a
852C<^> (caret). None of the fields is quoted, but embedded binary data is
853likely to be present. With the specific line ending, this should not be too
854hard to detect.
855
856By default, Text::CSV's parse function is instructed to only know about
857C<"\n"> and C<"\r"> to be legal line endings, and so has to deal with the
858embedded newline as a real C<end-of-line>, so it can scan the next line if
859binary is true, and the newline is inside a quoted field. With this option,
860we tell L</parse> to parse the line as if C<"\n"> is just nothing more than
861a binary character.
862
863For L</parse> this means that the parser has no more idea about line ending
864and L</getline> C<chomp>s line endings on reading.
865
866=head3 types
867
868A set of column types; the attribute is immediately passed to the L</types>
869method.
870
871=head3 callbacks
872
873See the L</Callbacks> section below.
874
875=head3 accessors
876
877To sum it up,
878
879 $csv = Text::CSV->new ();
880
881is equivalent to
882
883 $csv = Text::CSV->new ({
884 eol => undef, # \r, \n, or \r\n
885 sep_char => ',',
886 sep => undef,
887 quote_char => '"',
888 quote => undef,
889 escape_char => '"',
890 binary => 0,
891 decode_utf8 => 1,
892 auto_diag => 0,
893 diag_verbose => 0,
894 blank_is_undef => 0,
895 empty_is_undef => 0,
896 allow_whitespace => 0,
897 allow_loose_quotes => 0,
898 allow_loose_escapes => 0,
899 allow_unquoted_escape => 0,
900 always_quote => 0,
901 quote_empty => 0,
902 quote_space => 1,
903 escape_null => 1,
904 quote_binary => 1,
905 keep_meta_info => 0,
906 strict => 0,
907 formula => 0,
908 verbatim => 0,
909 undef_str => undef,
910 types => undef,
911 callbacks => undef,
912 });
913
914For all of the above mentioned flags, an accessor method is available where
915you can inquire the current value, or change the value
916
917 my $quote = $csv->quote_char;
918 $csv->binary (1);
919
920It is not wise to change these settings halfway through writing C<CSV> data
921to a stream. If however you want to create a new stream using the available
922C<CSV> object, there is no harm in changing them.
923
924If the L</new> constructor call fails, it returns C<undef>, and makes the
925fail reason available through the L</error_diag> method.
926
927 $csv = Text::CSV->new ({ ecs_char => 1 }) or
928 die "".Text::CSV->error_diag ();
929
930L</error_diag> will return a string like
931
932 "INI - Unknown attribute 'ecs_char'"
933
934=head2 known_attributes
935
936 @attr = Text::CSV->known_attributes;
937 @attr = Text::CSV::known_attributes;
938 @attr = $csv->known_attributes;
939
940This method will return an ordered list of all the supported attributes as
941described above. This can be useful for knowing what attributes are valid
942in classes that use or extend Text::CSV.
943
944=head2 print
945
946 $status = $csv->print ($fh, $colref);
947
948Similar to L</combine> + L</string> + L</print>, but much more efficient.
949It expects an array ref as input (not an array!) and the resulting string
950is not really created, but immediately written to the C<$fh> object,
951typically an IO handle or any other object that offers a L</print> method.
952
953For performance reasons C<print> does not create a result string, so all
954L</string>, L</status>, L</fields>, and L</error_input> methods will return
955undefined information after executing this method.
956
957If C<$colref> is C<undef> (explicit, not through a variable argument) and
958L</bind_columns> was used to specify fields to be printed, it is possible
959to make performance improvements, as otherwise data would have to be copied
960as arguments to the method call:
961
962 $csv->bind_columns (\($foo, $bar));
963 $status = $csv->print ($fh, undef);
964
965A short benchmark
966
967 my @data = ("aa" .. "zz");
968 $csv->bind_columns (\(@data));
969
970 $csv->print ($fh, [ @data ]); # 11800 recs/sec
971 $csv->print ($fh, \@data ); # 57600 recs/sec
972 $csv->print ($fh, undef ); # 48500 recs/sec
973
974=head2 say
975
976 $status = $csv->say ($fh, $colref);
977
978Like L<C<print>|/print>, but L<C<eol>|/eol> defaults to C<$\>.
979
980=head2 print_hr
981
982 $csv->print_hr ($fh, $ref);
983
984Provides an easy way to print a C<$ref> (as fetched with L</getline_hr>)
985provided the column names are set with L</column_names>.
986
987It is just a wrapper method with basic parameter checks over
988
989 $csv->print ($fh, [ map { $ref->{$_} } $csv->column_names ]);
990
991=head2 combine
992
993 $status = $csv->combine (@fields);
994
995This method constructs a C<CSV> record from C<@fields>, returning success
996or failure. Failure can result from lack of arguments or an argument that
997contains an invalid character. Upon success, L</string> can be called to
998retrieve the resultant C<CSV> string. Upon failure, the value returned by
999L</string> is undefined and L</error_input> could be called to retrieve the
1000invalid argument.
1001
1002=head2 string
1003
1004 $line = $csv->string ();
1005
1006This method returns the input to L</parse> or the resultant C<CSV> string
1007of L</combine>, whichever was called more recently.
1008
1009=head2 getline
1010
1011 $colref = $csv->getline ($fh);
1012
1013This is the counterpart to L</print>, as L</parse> is the counterpart to
1014L</combine>: it parses a row from the C<$fh> handle using the L</getline>
1015method associated with C<$fh> and parses this row into an array ref. This
1016array ref is returned by the function or C<undef> for failure. When C<$fh>
1017does not support C<getline>, you are likely to hit errors.
1018
1019When fields are bound with L</bind_columns> the return value is a reference
1020to an empty list.
1021
1022The L</string>, L</fields>, and L</status> methods are meaningless again.
1023
1024=head2 getline_all
1025
1026 $arrayref = $csv->getline_all ($fh);
1027 $arrayref = $csv->getline_all ($fh, $offset);
1028 $arrayref = $csv->getline_all ($fh, $offset, $length);
1029
1030This will return a reference to a list of L<getline ($fh)|/getline> results.
1031In this call, C<keep_meta_info> is disabled. If C<$offset> is negative, as
1032with C<splice>, only the last C<abs ($offset)> records of C<$fh> are taken
1033into consideration.
1034
1035Given a CSV file with 10 lines:
1036
1037 lines call
1038 ----- ---------------------------------------------------------
1039 0..9 $csv->getline_all ($fh) # all
1040 0..9 $csv->getline_all ($fh, 0) # all
1041 8..9 $csv->getline_all ($fh, 8) # start at 8
1042 - $csv->getline_all ($fh, 0, 0) # start at 0 first 0 rows
1043 0..4 $csv->getline_all ($fh, 0, 5) # start at 0 first 5 rows
1044 4..5 $csv->getline_all ($fh, 4, 2) # start at 4 first 2 rows
1045 8..9 $csv->getline_all ($fh, -2) # last 2 rows
1046 6..7 $csv->getline_all ($fh, -4, 2) # first 2 of last 4 rows
1047
1048=head2 getline_hr
1049
1050The L</getline_hr> and L</column_names> methods work together to allow you
1051to have rows returned as hashrefs. You must call L</column_names> first to
1052declare your column names.
1053
1054 $csv->column_names (qw( code name price description ));
1055 $hr = $csv->getline_hr ($fh);
1056 print "Price for $hr->{name} is $hr->{price} EUR\n";
1057
1058L</getline_hr> will croak if called before L</column_names>.
1059
1060Note that L</getline_hr> creates a hashref for every row and will be much
1061slower than the combined use of L</bind_columns> and L</getline> but still
1062offering the same ease of use hashref inside the loop:
1063
1064 my @cols = @{$csv->getline ($fh)};
1065 $csv->column_names (@cols);
1066 while (my $row = $csv->getline_hr ($fh)) {
1067 print $row->{price};
1068 }
1069
1070Could easily be rewritten to the much faster:
1071
1072 my @cols = @{$csv->getline ($fh)};
1073 my $row = {};
1074 $csv->bind_columns (\@{$row}{@cols});
1075 while ($csv->getline ($fh)) {
1076 print $row->{price};
1077 }
1078
1079Your mileage may vary for the size of the data and the number of rows. With
1080perl-5.14.2 the comparison for a 100_000 line file with 14 columns:
1081
1082 Rate hashrefs getlines
1083 hashrefs 1.00/s -- -76%
1084 getlines 4.15/s 313% --
1085
1086=head2 getline_hr_all
1087
1088 $arrayref = $csv->getline_hr_all ($fh);
1089 $arrayref = $csv->getline_hr_all ($fh, $offset);
1090 $arrayref = $csv->getline_hr_all ($fh, $offset, $length);
1091
1092This will return a reference to a list of L<getline_hr ($fh)|/getline_hr>
1093results. In this call, L<C<keep_meta_info>|/keep_meta_info> is disabled.
1094
1095=head2 parse
1096
1097 $status = $csv->parse ($line);
1098
1099This method decomposes a C<CSV> string into fields, returning success or
1100failure. Failure can result from a lack of argument or the given C<CSV>
1101string is improperly formatted. Upon success, L</fields> can be called to
1102retrieve the decomposed fields. Upon failure calling L</fields> will return
1103undefined data and L</error_input> can be called to retrieve the invalid
1104argument.
1105
1106You may use the L</types> method for setting column types. See L</types>'
1107description below.
1108
1109The C<$line> argument is supposed to be a simple scalar. Everything else is
1110supposed to croak and set error 1500.
1111
1112=head2 fragment
1113
1114This function tries to implement RFC7111 (URI Fragment Identifiers for the
1115text/csv Media Type) - http://tools.ietf.org/html/rfc7111
1116
1117 my $AoA = $csv->fragment ($fh, $spec);
1118
1119In specifications, C<*> is used to specify the I<last> item, a dash (C<->)
1120to indicate a range. All indices are C<1>-based: the first row or column
1121has index C<1>. Selections can be combined with the semi-colon (C<;>).
1122
1123When using this method in combination with L</column_names>, the returned
1124reference will point to a list of hashes instead of a list of lists. A
1125disjointed cell-based combined selection might return rows with different
1126number of columns making the use of hashes unpredictable.
1127
1128 $csv->column_names ("Name", "Age");
1129 my $AoH = $csv->fragment ($fh, "col=3;8");
1130
1131If the L</after_parse> callback is active, it is also called on every line
1132parsed and skipped before the fragment.
1133
1134=over 2
1135
1136=item row
1137
1138 row=4
1139 row=5-7
1140 row=6-*
1141 row=1-2;4;6-*
1142
1143=item col
1144
1145 col=2
1146 col=1-3
1147 col=4-*
1148 col=1-2;4;7-*
1149
1150=item cell
1151
1152In cell-based selection, the comma (C<,>) is used to pair row and column
1153
1154 cell=4,1
1155
1156The range operator (C<->) using C<cell>s can be used to define top-left and
1157bottom-right C<cell> location
1158
1159 cell=3,1-4,6
1160
1161The C<*> is only allowed in the second part of a pair
1162
1163 cell=3,2-*,2 # row 3 till end, only column 2
1164 cell=3,2-3,* # column 2 till end, only row 3
1165 cell=3,2-*,* # strip row 1 and 2, and column 1
1166
1167Cells and cell ranges may be combined with C<;>, possibly resulting in rows
1168with different number of columns
1169
1170 cell=1,1-2,2;3,3-4,4;1,4;4,1
1171
1172Disjointed selections will only return selected cells. The cells that are
1173not specified will not be included in the returned set, not even as
1174C<undef>. As an example given a C<CSV> like
1175
1176 11,12,13,...19
1177 21,22,...28,29
1178 : :
1179 91,...97,98,99
1180
1181with C<cell=1,1-2,2;3,3-4,4;1,4;4,1> will return:
1182
1183 11,12,14
1184 21,22
1185 33,34
1186 41,43,44
1187
1188Overlapping cell-specs will return those cells only once, so
1189C<cell=1,1-3,3;2,2-4,4;2,3;4,2> will return:
1190
1191 11,12,13
1192 21,22,23,24
1193 31,32,33,34
1194 42,43,44
1195
1196=back
1197
1198L<RFC7111|http://tools.ietf.org/html/rfc7111> does B<not> allow different
1199types of specs to be combined (either C<row> I<or> C<col> I<or> C<cell>).
1200Passing an invalid fragment specification will croak and set error 2013.
1201
1202=head2 column_names
1203
1204Set the "keys" that will be used in the L</getline_hr> calls. If no keys
1205(column names) are passed, it will return the current setting as a list.
1206
1207L</column_names> accepts a list of scalars (the column names) or a single
1208array_ref, so you can pass the return value from L</getline> too:
1209
1210 $csv->column_names ($csv->getline ($fh));
1211
1212L</column_names> does B<no> checking on duplicates at all, which might lead
1213to unexpected results. Undefined entries will be replaced with the string
1214C<"\cAUNDEF\cA">, so
1215
1216 $csv->column_names (undef, "", "name", "name");
1217 $hr = $csv->getline_hr ($fh);
1218
1219Will set C<< $hr->{"\cAUNDEF\cA"} >> to the 1st field, C<< $hr->{""} >> to
1220the 2nd field, and C<< $hr->{name} >> to the 4th field, discarding the 3rd
1221field.
1222
1223L</column_names> croaks on invalid arguments.
1224
1225=head2 header
1226
1227This method does NOT work in perl-5.6.x
1228
1229Parse the CSV header and set L<C<sep>|/sep>, column_names and encoding.
1230
1231 my @hdr = $csv->header ($fh);
1232 $csv->header ($fh, { sep_set => [ ";", ",", "|", "\t" ] });
1233 $csv->header ($fh, { detect_bom => 1, munge_column_names => "lc" });
1234
1235The first argument should be a file handle.
1236
1237This method resets some object properties, as it is supposed to be invoked
1238only once per file or stream. It will leave attributes C<column_names> and
1239C<bound_columns> alone if setting column names is disabled. Reading headers
1240on previously processed objects might fail on perl-5.8.0 and older.
1241
1242Assuming that the file opened for parsing has a header, and the header does
1243not contain problematic characters like embedded newlines, read the first
1244line from the open handle then auto-detect whether the header separates the
1245column names with a character from the allowed separator list.
1246
1247If any of the allowed separators matches, and none of the I<other> allowed
1248separators match, set L<C<sep>|/sep> to that separator for the current
1249CSV instance and use it to parse the first line, map those to lowercase,
1250and use that to set the instance L</column_names>:
1251
1252 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
1253 open my $fh, "<", "file.csv";
1254 binmode $fh; # for Windows
1255 $csv->header ($fh);
1256 while (my $row = $csv->getline_hr ($fh)) {
1257 ...
1258 }
1259
1260If the header is empty, contains more than one unique separator out of the
1261allowed set, contains empty fields, or contains identical fields (after
1262folding), it will croak with error 1010, 1011, 1012, or 1013 respectively.
1263
1264If the header contains embedded newlines or is not valid CSV in any other
1265way, this method will croak and leave the parse error untouched.
1266
1267A successful call to C<header> will always set the L<C<sep>|/sep> of the
1268C<$csv> object. This behavior can not be disabled.
1269
1270=head3 return value
1271
1272On error this method will croak.
1273
1274In list context, the headers will be returned whether they are used to set
1275L</column_names> or not.
1276
1277In scalar context, the instance itself is returned. B<Note>: the values as
1278found in the header will effectively be B<lost> if C<set_column_names> is
1279false.
1280
1281=head3 Options
1282
1283=over 2
1284
1285=item sep_set
1286
1287 $csv->header ($fh, { sep_set => [ ";", ",", "|", "\t" ] });
1288
1289The list of legal separators defaults to C<[ ";", "," ]> and can be changed
1290by this option. As this is probably the most often used option, it can be
1291passed on its own as an unnamed argument:
1292
1293 $csv->header ($fh, [ ";", ",", "|", "\t", "::", "\x{2063}" ]);
1294
1295Multi-byte sequences are allowed, both multi-character and Unicode. See
1296L<C<sep>|/sep>.
1297
1298=item detect_bom
1299
1300 $csv->header ($fh, { detect_bom => 1 });
1301
1302The default behavior is to detect if the header line starts with a BOM. If
1303the header has a BOM, use that to set the encoding of C<$fh>. This default
1304behavior can be disabled by passing a false value to C<detect_bom>.
1305
1306Supported encodings from BOM are: UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, and
1307UTF-32LE. BOM's also support UTF-1, UTF-EBCDIC, SCSU, BOCU-1, and GB-18030
1308but L<Encode> does not (yet). UTF-7 is not supported.
1309
1310If a supported BOM was detected as start of the stream, it is stored in the
1311object attribute C<ENCODING>.
1312
1313 my $enc = $csv->{ENCODING};
1314
1315The encoding is used with C<binmode> on C<$fh>.
1316
1317If the handle was opened in a (correct) encoding, this method will B<not>
1318alter the encoding, as it checks the leading B<bytes> of the first line. In
1319case the stream starts with a decoded BOM (C<U+FEFF>), C<{ENCODING}> will be
1320C<""> (empty) instead of the default C<undef>.
1321
1322=item munge_column_names
1323
1324This option offers the means to modify the column names into something that
1325is most useful to the application. The default is to map all column names
1326to lower case.
1327
1328 $csv->header ($fh, { munge_column_names => "lc" });
1329
1330The following values are available:
1331
1332 lc - lower case
1333 uc - upper case
1334 none - do not change
1335 \%hash - supply a mapping
1336 \&cb - supply a callback
1337
1338Literal:
1339
1340 $csv->header ($fh, { munge_column_names => "none" });
1341
1342Hash:
1343
1344 $csv->header ($fh, { munge_column_names => { foo => "sombrero" }});
1345
1346If a value does not exist, the original value is used unchanged.
1347
1348Callback:
1349
1350 $csv->header ($fh, { munge_column_names => sub { fc } });
1351 $csv->header ($fh, { munge_column_names => sub { "column_".$col++ } });
1352 $csv->header ($fh, { munge_column_names => sub { lc (s/\W+/_/gr) } });
1353
1354As this callback is called in a C<map>, you can use C<$_> directly.
1355
1356=item set_column_names
1357
1358 $csv->header ($fh, { set_column_names => 1 });
1359
1360The default is to set the instance's column names using L</column_names> if
1361the method is successful, so subsequent calls to L</getline_hr> can return
1362a hash. Disabling setting the header can be forced by using a false value for
1363this option.
1364
1365As described in L</return value> above, content is lost in scalar context.
1366
1367=back
1368
1369=head3 Validation
1370
1371When receiving CSV files from external sources, this method can be used to
1372protect against changes in the layout by restricting to known headers (and
1373typos in the header fields).
1374
1375 my %known = (
1376 "record key" => "c_rec",
1377 "rec id" => "c_rec",
1378 "id_rec" => "c_rec",
1379 "kode" => "code",
1380 "code" => "code",
1381 "vaule" => "value",
1382 "value" => "value",
1383 );
1384 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
1385 open my $fh, "<", $source or die "$source: $!";
1386 $csv->header ($fh, { munge_column_names => sub {
1387 s/\s+$//;
1388 s/^\s+//;
1389 $known{lc $_} or die "Unknown column '$_' in $source";
1390 }});
1391 while (my $row = $csv->getline_hr ($fh)) {
1392 say join "\t", $row->{c_rec}, $row->{code}, $row->{value};
1393 }
1394
1395=head2 bind_columns
1396
1397Takes a list of scalar references to be used for output with L</print> or
1398to store in the fields fetched by L</getline>. When you do not pass enough
1399references to store the fetched fields in, L</getline> will fail with error
1400C<3006>. If you pass more than there are fields to return, the content of
1401the remaining references is left untouched.
1402
1403 $csv->bind_columns (\$code, \$name, \$price, \$description);
1404 while ($csv->getline ($fh)) {
1405 print "The price of a $name is \x{20ac} $price\n";
1406 }
1407
1408To reset or clear all column binding, call L</bind_columns> with the single
1409argument C<undef>. This will also clear column names.
1410
1411 $csv->bind_columns (undef);
1412
1413If no arguments are passed at all, L</bind_columns> will return the list of
1414current bindings or C<undef> if no binds are active.
1415
1416Note that in parsing with C<bind_columns>, the fields are set on the fly.
1417That implies that if the third field of a row causes an error (or this row
1418has just two fields where the previous row had more), the first two fields
1419already have been assigned the values of the current row, while the rest of
1420the fields will still hold the values of the previous row. If you want the
1421parser to fail in these cases, use the L<C<strict>|/strict> attribute.
1422
1423=head2 eof
1424
1425 $eof = $csv->eof ();
1426
1427If L</parse> or L</getline> was used with an IO stream, this method will
1428return true (1) if the last call hit end of file, otherwise it will return
1429false (''). This is useful to see the difference between a failure and end
1430of file.
1431
1432Note that if the parsing of the last line caused an error, C<eof> is still
1433true. That means that if you are I<not> using L</auto_diag>, an idiom like
1434
1435 while (my $row = $csv->getline ($fh)) {
1436 # ...
1437 }
1438 $csv->eof or $csv->error_diag;
1439
1440will I<not> report the error. You would have to change that to
1441
1442 while (my $row = $csv->getline ($fh)) {
1443 # ...
1444 }
1445 +$csv->error_diag and $csv->error_diag;
1446
1447=head2 types
1448
1449 $csv->types (\@tref);
1450
1451This method is used to force that (all) columns are of a given type. For
1452example, if you have an integer column, two columns with doubles and a
1453string column, then you might do a
1454
1455 $csv->types ([Text::CSV::IV (),
1456 Text::CSV::NV (),
1457 Text::CSV::NV (),
1458 Text::CSV::PV ()]);
1459
1460Column types are used only for I<decoding> columns while parsing, in other
1461words by the L</parse> and L</getline> methods.
1462
1463You can unset column types by doing a
1464
1465 $csv->types (undef);
1466
1467or fetch the current type settings with
1468
1469 $types = $csv->types ();
1470
1471=over 4
1472
1473=item IV
1474
1475Set field type to integer.
1476
1477=item NV
1478
1479Set field type to numeric/float.
1480
1481=item PV
1482
1483Set field type to string.
1484
1485=back
1486
1487=head2 fields
1488
1489 @columns = $csv->fields ();
1490
1491This method returns the input to L</combine> or the resultant decomposed
1492fields of a successful L</parse>, whichever was called more recently.
1493
1494Note that the return value is undefined after using L</getline>, which does
1495not fill the data structures returned by L</parse>.
1496
1497=head2 meta_info
1498
1499 @flags = $csv->meta_info ();
1500
1501This method returns the "flags" of the input to L</combine> or the flags of
1502the resultant decomposed fields of L</parse>, whichever was called more
1503recently.
1504
1505For each field, a meta_info field will hold flags that inform something
1506about the field returned by the L</fields> method or passed to the
1507L</combine> method. The flags are bit-wise-C<or>'d like:
1508
1509=over 2
1510
1511=item C< >0x0001
1512
1513The field was quoted.
1514
1515=item C< >0x0002
1516
1517The field was binary.
1518
1519=back
1520
1521See the C<is_***> methods below.
1522
1523=head2 is_quoted
1524
1525 my $quoted = $csv->is_quoted ($column_idx);
1526
1527Where C<$column_idx> is the (zero-based) index of the column in the last
1528result of L</parse>.
1529
1530This returns a true value if the data in the indicated column was enclosed
1531in L<C<quote_char>|/quote_char> quotes. This might be important for fields
1532where content C<,20070108,> is to be treated as a numeric value, and where
1533C<,"20070108",> is explicitly marked as character string data.
1534
1535This method is only valid when L</keep_meta_info> is set to a true value.
1536
1537=head2 is_binary
1538
1539 my $binary = $csv->is_binary ($column_idx);
1540
1541Where C<$column_idx> is the (zero-based) index of the column in the last
1542result of L</parse>.
1543
1544This returns a true value if the data in the indicated column contained any
1545byte in the range C<[\x00-\x08,\x10-\x1F,\x7F-\xFF]>.
1546
1547This method is only valid when L</keep_meta_info> is set to a true value.
1548
1549=head2 is_missing
1550
1551 my $missing = $csv->is_missing ($column_idx);
1552
1553Where C<$column_idx> is the (zero-based) index of the column in the last
1554result of L</getline_hr>.
1555
1556 $csv->keep_meta_info (1);
1557 while (my $hr = $csv->getline_hr ($fh)) {
1558 $csv->is_missing (0) and next; # This was an empty line
1559 }
1560
1561When using L</getline_hr>, it is impossible to tell if the parsed fields
1562are C<undef> because they were not filled in the C<CSV> stream or because
1563they were not read at all, as B<all> the fields defined by L</column_names>
1564are set in the hash-ref. If you still need to know if all fields in each
1565row are provided, you should enable L<C<keep_meta_info>|/keep_meta_info> so
1566you can check the flags.
1567
1568If L<C<keep_meta_info>|/keep_meta_info> is C<false>, C<is_missing> will
1569always return C<undef>, regardless of C<$column_idx> being valid or not. If
1570this attribute is C<true> it will return either C<0> (the field is present)
1571or C<1> (the field is missing).
1572
1573A special case is the empty line. If the line is completely empty - after
1574dealing with the flags - this is still a valid CSV line: it is a record of
1575just one single empty field. However, if C<keep_meta_info> is set, invoking
1576C<is_missing> with index C<0> will now return true.
1577
1578=head2 status
1579
1580 $status = $csv->status ();
1581
1582This method returns the status of the last invoked L</combine> or L</parse>
1583call. Status is success (true: C<1>) or failure (false: C<undef> or C<0>).
1584
1585=head2 error_input
1586
1587 $bad_argument = $csv->error_input ();
1588
1589This method returns the erroneous argument (if it exists) of L</combine> or
1590L</parse>, whichever was called more recently. If the last invocation was
1591successful, C<error_input> will return C<undef>.
1592
1593=head2 error_diag
1594
1595 Text::CSV->error_diag ();
1596 $csv->error_diag ();
1597 $error_code = 0 + $csv->error_diag ();
1598 $error_str = "" . $csv->error_diag ();
1599 ($cde, $str, $pos, $rec, $fld) = $csv->error_diag ();
1600
1601If (and only if) an error occurred, this function returns the diagnostics
1602of that error.
1603
1604If called in void context, this will print the internal error code and the
1605associated error message to STDERR.
1606
1607If called in list context, this will return the error code and the error
1608message in that order. If the last error was from parsing, the rest of the
1609values returned are a best guess at the location within the line that was
1610being parsed. Their values are 1-based. The position currently is index of
1611the byte at which the parsing failed in the current record. It might change
1612to be the index of the current character in a later release. The record is
1613the index of the record parsed by the csv instance. The field number is the
1614index of the field the parser thinks it is currently trying to parse. See
1615F<examples/csv-check> for how this can be used.
1616
1617If called in scalar context, it will return the diagnostics in a single
1618scalar, a-la C<$!>. It will contain the error code in numeric context, and
1619the diagnostics message in string context.
1620
1621When called as a class method or a direct function call, the diagnostics
1622are that of the last L</new> call.
1623
1624=head2 record_number
1625
1626 $recno = $csv->record_number ();
1627
1628Returns the number of records parsed by this csv instance. This value should be more
1629accurate than C<$.> when embedded newlines come into play. Records written by
1630this instance are not counted.
1631
1632=head2 SetDiag
1633
1634 $csv->SetDiag (0);
1635
1636Use to reset the diagnostics if you are dealing with errors.
1637
1638=head1 ADDITIONAL METHODS
1639
1640=over
1641
1642=item backend
1643
1644Returns the backend module name called by Text::CSV.
1645C<module> is an alias.
1646
1647=item is_xs
1648
1649Returns true value if Text::CSV uses an XS backend.
1650
1651=item is_pp
1652
1653Returns true value if Text::CSV uses a pure-Perl backend.
1654
1655=back
1656
1657=head1 FUNCTIONS
1658
1659This section is also taken from Text::CSV_XS.
1660
1661=head2 csv
1662
1663This function is not exported by default and should be explicitly requested:
1664
1665 use Text::CSV qw( csv );
1666
1667This is a high-level function that aims at simple (user) interfaces. This
1668can be used to read/parse a C<CSV> file or stream (the default behavior) or
1669to produce a file or write to a stream (define the C<out> attribute). It
1670returns an array- or hash-reference on parsing (or C<undef> on fail) or the
1671numeric value of L</error_diag> on writing. When this function fails you
1672can get to the error using the class call to L</error_diag>
1673
1674 my $aoa = csv (in => "test.csv") or
1675 die Text::CSV->error_diag;
1676
1677This function takes the arguments as key-value pairs. This can be passed as
1678a list or as an anonymous hash:
1679
1680 my $aoa = csv ( in => "test.csv", sep_char => ";");
1681 my $aoh = csv ({ in => $fh, headers => "auto" });
1682
1683The arguments passed consist of two parts: the arguments to L</csv> itself
1684and the optional attributes to the C<CSV> object used inside the function
1685as enumerated and explained in L</new>.
1686
1687If not overridden, the default options used for CSV are
1688
1689 auto_diag => 1
1690 escape_null => 0
1691
1692The option that is always set and cannot be altered is
1693
1694 binary => 1
1695
1696As this function will likely be used in one-liners, it allows C<quote> to
1697be abbreviated as C<quo>, and C<escape_char> to be abbreviated as C<esc>
1698or C<escape>.
1699
1700Alternative invocations:
1701
1702 my $aoa = Text::CSV::csv (in => "file.csv");
1703
1704 my $csv = Text::CSV->new ();
1705 my $aoa = $csv->csv (in => "file.csv");
1706
1707In the latter case, the object attributes are used from the existing object
1708and the attribute arguments in the function call are ignored:
1709
1710 my $csv = Text::CSV->new ({ sep_char => ";" });
1711 my $aoh = $csv->csv (in => "file.csv", bom => 1);
1712
1713will parse using C<;> as C<sep_char>, not C<,>.
1714
1715=head3 in
1716
1717Used to specify the source. C<in> can be a file name (e.g. C<"file.csv">),
1718which will be opened for reading and closed when finished, a file handle
1719(e.g. C<$fh> or C<FH>), a reference to a glob (e.g. C<\*ARGV>), the glob
1720itself (e.g. C<*STDIN>), or a reference to a scalar (e.g. C<\q{1,2,"csv"}>).
1721
1722When used with L</out>, C<in> should be a reference to a CSV structure (AoA
1723or AoH) or a CODE-ref that returns an array-reference or a hash-reference.
1724The code-ref will be invoked with no arguments.
1725
1726 my $aoa = csv (in => "file.csv");
1727
1728 open my $fh, "<", "file.csv";
1729 my $aoa = csv (in => $fh);
1730
1731 my $csv = [ [qw( Foo Bar )], [ 1, 2 ], [ 2, 3 ]];
1732 my $err = csv (in => $csv, out => "file.csv");
1733
1734If called in void context without the L</out> attribute, the resulting ref
1735will be used as input to a subsequent call to csv:
1736
1737 csv (in => "file.csv", filter => { 2 => sub { length > 2 }})
1738
1739will be a shortcut to
1740
1741 csv (in => csv (in => "file.csv", filter => { 2 => sub { length > 2 }}))
1742
1743where, in the absence of the C<out> attribute, this is a shortcut to
1744
1745 csv (in => csv (in => "file.csv", filter => { 2 => sub { length > 2 }}),
1746 out => *STDOUT)
1747
1748=head3 out
1749
1750 csv (in => $aoa, out => "file.csv");
1751 csv (in => $aoa, out => $fh);
1752 csv (in => $aoa, out => STDOUT);
1753 csv (in => $aoa, out => *STDOUT);
1754 csv (in => $aoa, out => \*STDOUT);
1755 csv (in => $aoa, out => \my $data);
1756 csv (in => $aoa, out => undef);
1757 csv (in => $aoa, out => \"skip");
1758
1759In output mode, the default CSV options when producing CSV are
1760
1761 eol => "\r\n"
1762
1763The L</fragment> attribute is ignored in output mode.
1764
1765C<out> can be a file name (e.g. C<"file.csv">), which will be opened for
1766writing and closed when finished, a file handle (e.g. C<$fh> or C<FH>), a
1767reference to a glob (e.g. C<\*STDOUT>), the glob itself (e.g. C<*STDOUT>),
1768or a reference to a scalar (e.g. C<\my $data>).
1769
1770 csv (in => sub { $sth->fetch }, out => "dump.csv");
1771 csv (in => sub { $sth->fetchrow_hashref }, out => "dump.csv",
1772 headers => $sth->{NAME_lc});
1773
1774When a code-ref is used for C<in>, the output is generated per invocation,
1775so no buffering is involved. This implies that there is no size restriction
1776on the number of records. The C<csv> function ends when the coderef returns
1777a false value.
1778
1779If C<out> is set to a reference of the literal string C<"skip">, the output
1780will be suppressed completely, which might be useful in combination with a
1781filter for side effects only.
1782
1783 my %cache;
1784 csv (in => "dump.csv",
1785 out => \"skip",
1786 on_in => sub { $cache{$_[1][1]}++ });
1787
1788Currently, setting C<out> to any false value (C<undef>, C<"">, 0) will be
1789equivalent to C<\"skip">.
1790
1791=head3 encoding
1792
1793If passed, it should be an encoding accepted by the C<:encoding()> option
1794to C<open>. There is no default value. This attribute does not work in perl
17955.6.x. C<encoding> can be abbreviated to C<enc> for ease of use in command
1796line invocations.
1797
1798If C<encoding> is set to the literal value C<"auto">, the method L</header>
1799will be invoked on the opened stream to check if there is a BOM and set the
1800encoding accordingly. This is equal to passing a true value in the option
1801L<C<detect_bom>|/detect_bom>.
1802
1803=head3 detect_bom
1804
1805If C<detect_bom> is given, the method L</header> will be invoked on the
1806opened stream to check if there is a BOM and set the encoding accordingly.
1807
1808C<detect_bom> can be abbreviated to C<bom>.
1809
1810This is the same as setting L<C<encoding>|/encoding> to C<"auto">.
1811
1812Note that as the method L</header> is invoked, its default is to also set
1813the headers.
1814
1815=head3 headers
1816
1817If this attribute is not given, the default behavior is to produce an array
1818of arrays.
1819
1820If C<headers> is supplied, it should be an anonymous list of column names,
1821an anonymous hashref, a coderef, or a literal flag: C<auto>, C<lc>, C<uc>,
1822or C<skip>.
1823
1824=over 2
1825
1826=item skip
1827
1828When C<skip> is used, the header will not be included in the output.
1829
1830 my $aoa = csv (in => $fh, headers => "skip");
1831
1832=item auto
1833
1834If C<auto> is used, the first line of the C<CSV> source will be read as the
1835list of field headers and used to produce an array of hashes.
1836
1837 my $aoh = csv (in => $fh, headers => "auto");
1838
1839=item lc
1840
1841If C<lc> is used, the first line of the C<CSV> source will be read as the
1842list of field headers mapped to lower case and used to produce an array of
1843hashes. This is a variation of C<auto>.
1844
1845 my $aoh = csv (in => $fh, headers => "lc");
1846
1847=item uc
1848
1849If C<uc> is used, the first line of the C<CSV> source will be read as the
1850list of field headers mapped to upper case and used to produce an array of
1851hashes. This is a variation of C<auto>.
1852
1853 my $aoh = csv (in => $fh, headers => "uc");
1854
1855=item CODE
1856
1857If a coderef is used, the first line of the C<CSV> source will be read as
1858the list of mangled field headers in which each field is passed as the only
1859argument to the coderef. This list is used to produce an array of hashes.
1860
1861 my $aoh = csv (in => $fh,
1862 headers => sub { lc ($_[0]) =~ s/kode/code/gr });
1863
1864This example is a variation of using C<lc> where all occurrences of C<kode>
1865are replaced with C<code>.
1866
1867=item ARRAY
1868
1869If C<headers> is an anonymous list, the entries in the list will be used
1870as field names. The first line is considered data instead of headers.
1871
1872 my $aoh = csv (in => $fh, headers => [qw( Foo Bar )]);
1873 csv (in => $aoa, out => $fh, headers => [qw( code description price )]);
1874
1875=item HASH
1876
1877If C<headers> is a hash reference, this implies C<auto>, but header fields
1878that exist as key in the hashref will be replaced by the value for that
1879key. Given a CSV file like
1880
1881 post-kode,city,name,id number,fubble
1882 1234AA,Duckstad,Donald,13,"X313DF"
1883
1884using
1885
1886 csv (headers => { "post-kode" => "pc", "id number" => "ID" }, ...
1887
1888will return an entry like
1889
1890 { pc => "1234AA",
1891 city => "Duckstad",
1892 name => "Donald",
1893 ID => "13",
1894 fubble => "X313DF",
1895 }
1896
1897=back
1898
1899See also L<C<munge_column_names>|/munge_column_names> and
1900L<C<set_column_names>|/set_column_names>.
1901
1902=head3 munge_column_names
1903
1904If C<munge_column_names> is set, the method L</header> is invoked on the
1905opened stream with all matching arguments to detect and set the headers.
1906
1907C<munge_column_names> can be abbreviated to C<munge>.
1908
1909=head3 key
1910
1911If passed, will default L<C<headers>|/headers> to C<"auto"> and return a
1912hashref instead of an array of hashes. Allowed values are simple scalars or
1913array-references where the first element is the joiner and the rest are the
1914fields to join to combine the key.
1915
1916 my $ref = csv (in => "test.csv", key => "code");
1917 my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ]);
1918
1919with test.csv like
1920
1921 code,product,price,color
1922 1,pc,850,gray
1923 2,keyboard,12,white
1924 3,mouse,5,black
1925
1926the first example will return
1927
1928 { 1 => {
1929 code => 1,
1930 color => 'gray',
1931 price => 850,
1932 product => 'pc'
1933 },
1934 2 => {
1935 code => 2,
1936 color => 'white',
1937 price => 12,
1938 product => 'keyboard'
1939 },
1940 3 => {
1941 code => 3,
1942 color => 'black',
1943 price => 5,
1944 product => 'mouse'
1945 }
1946 }
1947
1948the second example will return
1949
1950 { "1:gray" => {
1951 code => 1,
1952 color => 'gray',
1953 price => 850,
1954 product => 'pc'
1955 },
1956 "2:white" => {
1957 code => 2,
1958 color => 'white',
1959 price => 12,
1960 product => 'keyboard'
1961 },
1962 "3:black" => {
1963 code => 3,
1964 color => 'black',
1965 price => 5,
1966 product => 'mouse'
1967 }
1968 }
1969
1970The C<key> attribute can be combined with L<C<headers>|/headers> for C<CSV>
1971data that has no header line, like
1972
1973 my $ref = csv (
1974 in => "foo.csv",
1975 headers => [qw( c_foo foo bar description stock )],
1976 key => "c_foo",
1977 );
1978
1979=head3 value
1980
1981Used to create key-value hashes.
1982
1983Only allowed when C<key> is valid. A C<value> can be either a single column
1984label or an anonymous list of column labels. In the first case, the value
1985will be a simple scalar value, in the latter case, it will be a hashref.
1986
1987 my $ref = csv (in => "test.csv", key => "code",
1988 value => "price");
1989 my $ref = csv (in => "test.csv", key => "code",
1990 value => [ "product", "price" ]);
1991 my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ],
1992 value => "price");
1993 my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ],
1994 value => [ "product", "price" ]);
1995
1996with test.csv like
1997
1998 code,product,price,color
1999 1,pc,850,gray
2000 2,keyboard,12,white
2001 3,mouse,5,black
2002
2003the first example will return
2004
2005 { 1 => 850,
2006 2 => 12,
2007 3 => 5,
2008 }
2009
2010the second example will return
2011
2012 { 1 => {
2013 price => 850,
2014 product => 'pc'
2015 },
2016 2 => {
2017 price => 12,
2018 product => 'keyboard'
2019 },
2020 3 => {
2021 price => 5,
2022 product => 'mouse'
2023 }
2024 }
2025
2026the third example will return
2027
2028 { "1:gray" => 850,
2029 "2:white" => 12,
2030 "3:black" => 5,
2031 }
2032
2033the fourth example will return
2034
2035 { "1:gray" => {
2036 price => 850,
2037 product => 'pc'
2038 },
2039 "2:white" => {
2040 price => 12,
2041 product => 'keyboard'
2042 },
2043 "3:black" => {
2044 price => 5,
2045 product => 'mouse'
2046 }
2047 }
2048
2049=head3 keep_headers
2050
2051When using hashes, keep the column names in the arrayref passed, so all
2052headers are available after the call in the original order.
2053
2054 my $aoh = csv (in => "file.csv", keep_headers => \my @hdr);
2055
2056This attribute can be abbreviated to C<kh> or passed as C<keep_column_names>.
2057
2058This attribute implies a default of C<auto> for the C<headers> attribute.
2059
2060=head3 fragment
2061
2062Only output the fragment as defined in the L</fragment> method. This option
2063is ignored when I<generating> C<CSV>. See L</out>.
2064
2065Combining all of them could give something like
2066
2067 use Text::CSV qw( csv );
2068 my $aoh = csv (
2069 in => "test.txt",
2070 encoding => "utf-8",
2071 headers => "auto",
2072 sep_char => "|",
2073 fragment => "row=3;6-9;15-*",
2074 );
2075 say $aoh->[15]{Foo};
2076
2077=head3 sep_set
2078
2079If C<sep_set> is set, the method L</header> is invoked on the opened stream
2080to detect and set L<C<sep_char>|/sep_char> with the given set.
2081
2082C<sep_set> can be abbreviated to C<seps>.
2083
2084Note that as the L</header> method is invoked, its default is to also set
2085the headers.
2086
2087=head3 set_column_names
2088
2089If C<set_column_names> is passed, the method L</header> is invoked on the
2090opened stream with all arguments meant for L</header>.
2091
2092If C<set_column_names> is passed as a false value, the content of the first
2093row is only preserved if the output is AoA:
2094
2095With an input-file like
2096
2097 bAr,foo
2098 1,2
2099 3,4,5
2100
2101This call
2102
2103 my $aoa = csv (in => $file, set_column_names => 0);
2104
2105will result in
2106
2107 [[ "bar", "foo" ],
2108 [ "1", "2" ],
2109 [ "3", "4", "5" ]]
2110
2111and
2112
2113 my $aoa = csv (in => $file, set_column_names => 0, munge => "none");
2114
2115will result in
2116
2117 [[ "bAr", "foo" ],
2118 [ "1", "2" ],
2119 [ "3", "4", "5" ]]
2120
2121=head2 Callbacks
2122
2123Callbacks enable actions triggered from the I<inside> of Text::CSV.
2124
2125While most of what this enables can easily be done in an unrolled loop as
2126described in the L</SYNOPSIS>, callbacks can be used to meet special demands
2127or enhance the L</csv> function.
2128
2129=over 2
2130
2131=item error
2132
2133 $csv->callbacks (error => sub { $csv->SetDiag (0) });
2134
2135the C<error> callback is invoked when an error occurs, but I<only> when
2136L</auto_diag> is set to a true value. A callback is invoked with the values
2137returned by L</error_diag>:
2138
2139 my ($c, $s);
2140
2141 sub ignore3006
2142 {
2143 my ($err, $msg, $pos, $recno, $fldno) = @_;
2144 if ($err == 3006) {
2145 # ignore this error
2146 ($c, $s) = (undef, undef);
2147 Text::CSV->SetDiag (0);
2148 }
2149 # Any other error
2150 return;
2151 } # ignore3006
2152
2153 $csv->callbacks (error => \&ignore3006);
2154 $csv->bind_columns (\$c, \$s);
2155 while ($csv->getline ($fh)) {
2156 # Error 3006 will not stop the loop
2157 }
2158
2159=item after_parse
2160
2161 $csv->callbacks (after_parse => sub { push @{$_[1]}, "NEW" });
2162 while (my $row = $csv->getline ($fh)) {
2163 $row->[-1] eq "NEW";
2164 }
2165
2166This callback is invoked after parsing with L</getline> only if no error
2167occurred. The callback is invoked with two arguments: the current C<CSV>
2168parser object and an array reference to the fields parsed.
2169
2170The return code of the callback is ignored unless it is a reference to the
2171string "skip", in which case the record will be skipped in L</getline_all>.
2172
2173 sub add_from_db
2174 {
2175 my ($csv, $row) = @_;
2176 $sth->execute ($row->[4]);
2177 push @$row, $sth->fetchrow_array;
2178 } # add_from_db
2179
2180 my $aoa = csv (in => "file.csv", callbacks => {
2181 after_parse => \&add_from_db });
2182
2183This hook can be used for validation:
2184
2185=over 2
2186
2187=item FAIL
2188
2189Die if any of the records does not validate a rule:
2190
2191 after_parse => sub {
2192 $_[1][4] =~ m/^[0-9]{4}\s?[A-Z]{2}$/ or
2193 die "5th field does not have a valid Dutch zipcode";
2194 }
2195
2196=item DEFAULT
2197
2198Replace invalid fields with a default value:
2199
2200 after_parse => sub { $_[1][2] =~ m/^\d+$/ or $_[1][2] = 0 }
2201
2202=item SKIP
2203
2204Skip records that have invalid fields (only applies to L</getline_all>):
2205
2206 after_parse => sub { $_[1][0] =~ m/^\d+$/ or return \"skip"; }
2207
2208=back
2209
2210=item before_print
2211
2212 my $idx = 1;
2213 $csv->callbacks (before_print => sub { $_[1][0] = $idx++ });
2214 $csv->print (*STDOUT, [ 0, $_ ]) for @members;
2215
2216This callback is invoked before printing with L</print> only if no error
2217occurred. The callback is invoked with two arguments: the current C<CSV>
2218parser object and an array reference to the fields passed.
2219
2220The return code of the callback is ignored.
2221
2222 sub max_4_fields
2223 {
2224 my ($csv, $row) = @_;
2225 @$row > 4 and splice @$row, 4;
2226 } # max_4_fields
2227
2228 csv (in => csv (in => "file.csv"), out => *STDOUT,
2229 callbacks => { before_print => \&max_4_fields });
2230
2231This callback is not active for L</combine>.
2232
2233=back
2234
2235=head3 Callbacks for csv ()
2236
2237The L</csv> function allows for some callbacks that do not integrate in XS
2238internals but are only available through the L</csv> function.
2239
2240 csv (in => "file.csv",
2241 callbacks => {
2242 filter => { 6 => sub { $_ > 15 } }, # first
2243 after_parse => sub { say "AFTER PARSE"; }, # first
2244 after_in => sub { say "AFTER IN"; }, # second
2245 on_in => sub { say "ON IN"; }, # third
2246 },
2247 );
2248
2249 csv (in => $aoh,
2250 out => "file.csv",
2251 callbacks => {
2252 on_in => sub { say "ON IN"; }, # first
2253 before_out => sub { say "BEFORE OUT"; }, # second
2254 before_print => sub { say "BEFORE PRINT"; }, # third
2255 },
2256 );
2257
2258=over 2
2259
2260=item filter
2261
2262This callback can be used to filter records. It is called just after a new
2263record has been scanned. The callback accepts a:
2264
2265=over 2
2266
2267=item hashref
2268
2269The keys are the index to the row (the field name or field number, 1-based)
2270and the values are subs to return a true or false value.
2271
2272 csv (in => "file.csv", filter => {
2273 3 => sub { m/a/ }, # third field should contain an "a"
2274 5 => sub { length > 4 }, # length of the 5th field minimal 5
2275 });
2276
2277 csv (in => "file.csv", filter => { foo => sub { $_ > 4 }});
2278
2279If the keys to the filter hash contain any character that is not a digit it
2280will also implicitly set L</headers> to C<"auto"> unless L</headers> was
2281already passed as argument. When headers are active, returning an array of
2282hashes, the filter is not applicable to the header itself.
2283
2284All sub results should match, as in AND.
2285
2286The context of the callback sets C<$_> localized to the field indicated by
2287the filter. The two arguments are as with all other callbacks, so the other
2288fields in the current row can be seen:
2289
2290 filter => { 3 => sub { $_ > 100 ? $_[1][1] =~ m/A/ : $_[1][6] =~ m/B/ }}
2291
2292If the context is set to return a list of hashes (L</headers> is defined),
2293the current record will also be available in the localized C<%_>:
2294
2295 filter => { 3 => sub { $_ > 100 && $_{foo} =~ m/A/ && $_{bar} < 1000 }}
2296
2297If the filter is used to I<alter> the content by changing C<$_>, make sure
2298that the sub returns true in order not to have that record skipped:
2299
2300 filter => { 2 => sub { $_ = uc }}
2301
2302will upper-case the second field, and then skip it if the resulting content
2303evaluates to false. To always accept, end with truth:
2304
2305 filter => { 2 => sub { $_ = uc; 1 }}
2306
2307=item coderef
2308
2309 csv (in => "file.csv", filter => sub { $n++; 0; });
2310
2311If the argument to C<filter> is a coderef, it is an alias or shortcut to a
2312filter on column 0:
2313
2314 csv (filter => sub { $n++; 0 });
2315
2316is equal to
2317
2318 csv (filter => { 0 => sub { $n++; 0 } });
2319
2320=item filter-name
2321
2322 csv (in => "file.csv", filter => "not_blank");
2323 csv (in => "file.csv", filter => "not_empty");
2324 csv (in => "file.csv", filter => "filled");
2325
2326These are predefined filters
2327
2328Given a file like (line numbers prefixed for doc purpose only):
2329
2330 1:1,2,3
2331 2:
2332 3:,
2333 4:""
2334 5:,,
2335 6:, ,
2336 7:"",
2337 8:" "
2338 9:4,5,6
2339
2340=over 2
2341
2342=item not_blank
2343
2344Filter out the blank lines
2345
2346This filter is a shortcut for
2347
2348 filter => { 0 => sub { @{$_[1]} > 1 or
2349 defined $_[1][0] && $_[1][0] ne "" } }
2350
2351Due to the implementation, it is currently impossible to also filter lines
2352that consist only of a quoted empty field. These lines are also considered
2353blank lines.
2354
2355With the given example, lines 2 and 4 will be skipped.
2356
2357=item not_empty
2358
2359Filter out lines where all the fields are empty.
2360
2361This filter is a shortcut for
2362
2363 filter => { 0 => sub { grep { defined && $_ ne "" } @{$_[1]} } }
2364
2365A space is not regarded being empty, so given the example data, lines 2, 3,
23664, 5, and 7 are skipped.
2367
2368=item filled
2369
2370Filter out lines that have no visible data
2371
2372This filter is a shortcut for
2373
2374 filter => { 0 => sub { grep { defined && m/\S/ } @{$_[1]} } }
2375
2376This filter rejects all lines that do I<not> have at least one field that does
2377not evaluate to the empty string.
2378
2379With the given example data, this filter would skip lines 2 through 8.
2380
2381=back
2382
2383=back
2384
2385=item after_in
2386
2387This callback is invoked for each record after all records have been parsed
2388but before returning the reference to the caller. The hook is invoked with
2389two arguments: the current C<CSV> parser object and a reference to the
2390record. The reference can be a reference to a HASH or a reference to an
2391ARRAY as determined by the arguments.
2392
2393This callback can also be passed as an attribute without the C<callbacks>
2394wrapper.
2395
2396=item before_out
2397
2398This callback is invoked for each record before the record is printed. The
2399hook is invoked with two arguments: the current C<CSV> parser object and a
2400reference to the record. The reference can be a reference to a HASH or a
2401reference to an ARRAY as determined by the arguments.
2402
2403This callback can also be passed as an attribute without the C<callbacks>
2404wrapper.
2405
2406This callback makes the row available in C<%_> if the row is a hashref. In
2407this case C<%_> is writable and will change the original row.
2408
2409=item on_in
2410
2411This callback acts exactly as the L</after_in> or the L</before_out> hooks.
2412
2413This callback can also be passed as an attribute without the C<callbacks>
2414wrapper.
2415
2416This callback makes the row available in C<%_> if the row is a hashref. In
2417this case C<%_> is writable and will change the original row. So e.g. with
2418
2419 my $aoh = csv (
2420 in => \"foo\n1\n2\n",
2421 headers => "auto",
2422 on_in => sub { $_{bar} = 2; },
2423 );
2424
2425C<$aoh> will be:
2426
2427 [ { foo => 1,
2428 bar => 2,
2429 },
2430 { foo => 2,
2431 bar => 2,
2432 }
2433 ]
2434
2435=item csv
2436
2437The I<function> L</csv> can also be called as a method or with an existing
2438Text::CSV object. This could help if the function is to be invoked a lot
2439of times and the overhead of creating the object internally over and over
2440again would be prevented by passing an existing instance.
2441
2442 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
2443
2444 my $aoa = $csv->csv (in => $fh);
2445 my $aoa = csv (in => $fh, csv => $csv);
2446
2447both act the same. Running this 20000 times on a 20-line CSV file showed
2448a 53% speedup.
2449
2450=back
2451
2452=head1 DIAGNOSTICS
2453
2454This section is also taken from Text::CSV_XS.
2455
2456Still under construction ...
2457
2458If an error occurs, C<< $csv->error_diag >> can be used to get information
2459on the cause of the failure. Note that for speed reasons the internal value
2460is never cleared on success, so using the value returned by L</error_diag>
2461in normal cases - when no error occurred - may cause unexpected results.
2462
2463If the constructor failed, the cause can be found using L</error_diag> as a
2464class method, like C<< Text::CSV->error_diag >>.
2465
2466The C<< $csv->error_diag >> method is automatically invoked upon error when
2467the constructor was called with L<C<auto_diag>|/auto_diag> set to C<1> or
2468C<2>, or when L<autodie> is in effect. When set to C<1>, this will cause a
2469C<warn> with the error message, when set to C<2>, it will C<die>. C<2012 -
2470EOF> is excluded from L<C<auto_diag>|/auto_diag> reports.
2471
2472Errors can be (individually) caught using the L</error> callback.
2473
2474The errors as described below are available. I have tried to make the error
2475itself explanatory enough, but more descriptions will be added. For most of
2476these errors, the first three capitals describe the error category:
2477
2478=over 2
2479
2480=item *
2481INI
2482
2483Initialization error or option conflict.
2484
2485=item *
2486ECR
2487
2488Carriage-Return related parse error.
2489
2490=item *
2491EOF
2492
2493End-Of-File related parse error.
2494
2495=item *
2496EIQ
2497
2498Parse error inside quotation.
2499
2500=item *
2501EIF
2502
2503Parse error inside field.
2504
2505=item *
2506ECB
2507
2508Combine error.
2509
2510=item *
2511EHR
2512
2513HashRef parse related error.
2514
2515=back
2516
2517And below should be the complete list of error codes that can be returned:
2518
2519=over 2
2520
2521=item *
25221001 "INI - sep_char is equal to quote_char or escape_char"
2523
2524The L<separation character|/sep_char> cannot be equal to L<the quotation
2525character|/quote_char> or to L<the escape character|/escape_char>, as this
2526would invalidate all parsing rules.
2527
2528=item *
25291002 "INI - allow_whitespace with escape_char or quote_char SP or TAB"
2530
2531Using the L<C<allow_whitespace>|/allow_whitespace> attribute when either
2532L<C<quote_char>|/quote_char> or L<C<escape_char>|/escape_char> is equal to
2533C<SPACE> or C<TAB> is too ambiguous to allow.
2534
2535=item *
25361003 "INI - \r or \n in main attr not allowed"
2537
2538Using default L<C<eol>|/eol> characters in either L<C<sep_char>|/sep_char>,
2539L<C<quote_char>|/quote_char>, or L<C<escape_char>|/escape_char> is not
2540allowed.
2541
2542=item *
25431004 "INI - callbacks should be undef or a hashref"
2544
2545The L<C<callbacks>|/Callbacks> attribute only allows one to be C<undef> or
2546a hash reference.
2547
2548=item *
25491005 "INI - EOL too long"
2550
2551The value passed for EOL is exceeding its maximum length (16).
2552
2553=item *
25541006 "INI - SEP too long"
2555
2556The value passed for SEP is exceeding its maximum length (16).
2557
2558=item *
25591007 "INI - QUOTE too long"
2560
2561The value passed for QUOTE is exceeding its maximum length (16).
2562
2563=item *
25641008 "INI - SEP undefined"
2565
2566The value passed for SEP should be defined and not empty.
2567
2568=item *
25691010 "INI - the header is empty"
2570
2571The header line parsed in the L</header> is empty.
2572
2573=item *
25741011 "INI - the header contains more than one valid separator"
2575
2576The header line parsed in the L</header> contains more than one (unique)
2577separator character out of the allowed set of separators.
2578
2579=item *
25801012 "INI - the header contains an empty field"
2581
2582The header line parsed in the L</header> contains an empty field.
2583
2584=item *
25851013 "INI - the header contains nun-unique fields"
2586
2587The header line parsed in the L</header> contains at least two identical
2588fields.
2589
2590=item *
25911014 "INI - header called on undefined stream"
2592
2593The header line cannot be parsed from an undefined source.
2594
2595=item *
25961500 "PRM - Invalid/unsupported argument(s)"
2597
2598Function or method called with invalid argument(s) or parameter(s).
2599
2600=item *
26011501 "PRM - The key attribute is passed as an unsupported type"
2602
2603The C<key> attribute is of an unsupported type.
2604
2605=item *
26061502 "PRM - The value attribute is passed without the key attribute"
2607
2608The C<value> attribute is only allowed when a valid key is given.
2609
2610=item *
26111503 "PRM - The value attribute is passed as an unsupported type"
2612
2613The C<value> attribute is of an unsupported type.
2614
2615=item *
26162010 "ECR - QUO char inside quotes followed by CR not part of EOL"
2617
2618When L<C<eol>|/eol> has been set to anything but the default, like
2619C<"\r\t\n">, and the C<"\r"> is following the B<second> (closing)
2620L<C<quote_char>|/quote_char>, where the characters following the C<"\r"> do
2621not make up the L<C<eol>|/eol> sequence, this is an error.
2622
2623=item *
26242011 "ECR - Characters after end of quoted field"
2625
2626Sequences like C<1,foo,"bar"baz,22,1> are not allowed. C<"bar"> is a quoted
2627field and after the closing double-quote, there should be either a new-line
2628sequence or a separation character.
2629
2630=item *
26312012 "EOF - End of data in parsing input stream"
2632
2633Self-explaining. End-of-file while inside parsing a stream. Can happen only
2634when reading from streams with L</getline>, as using L</parse> is done on
2635strings that are not required to have a trailing L<C<eol>|/eol>.
2636
2637=item *
26382013 "INI - Specification error for fragments RFC7111"
2639
2640Invalid specification for URI L</fragment> specification.
2641
2642=item *
26432014 "ENF - Inconsistent number of fields"
2644
2645Inconsistent number of fields under strict parsing.
2646
2647=item *
26482021 "EIQ - NL char inside quotes, binary off"
2649
2650Sequences like C<1,"foo\nbar",22,1> are allowed only when the binary option
2651has been selected with the constructor.
2652
2653=item *
26542022 "EIQ - CR char inside quotes, binary off"
2655
2656Sequences like C<1,"foo\rbar",22,1> are allowed only when the binary option
2657has been selected with the constructor.
2658
2659=item *
26602023 "EIQ - QUO character not allowed"
2661
2662Sequences like C<"foo "bar" baz",qu> and C<2023,",2008-04-05,"Foo, Bar",\n>
2663will cause this error.
2664
2665=item *
26662024 "EIQ - EOF cannot be escaped, not even inside quotes"
2667
2668The escape character is not allowed as last character in an input stream.
2669
2670=item *
26712025 "EIQ - Loose unescaped escape"
2672
2673An escape character should escape only characters that need escaping.
2674
2675Allowing the escape for other characters is possible with the attribute
2676L</allow_loose_escape>.
2677
2678=item *
26792026 "EIQ - Binary character inside quoted field, binary off"
2680
2681Binary characters are not allowed by default. Exceptions are fields that
2682contain valid UTF-8, that will automatically be upgraded if the content is
2683valid UTF-8. Set L<C<binary>|/binary> to C<1> to accept binary data.
2684
2685=item *
26862027 "EIQ - Quoted field not terminated"
2687
2688When parsing a field that started with a quotation character, the field is
2689expected to be closed with a quotation character. When the parsed line is
2690exhausted before the quote is found, that field is not terminated.
2691
2692=item *
26932030 "EIF - NL char inside unquoted verbatim, binary off"
2694
2695=item *
26962031 "EIF - CR char is first char of field, not part of EOL"
2697
2698=item *
26992032 "EIF - CR char inside unquoted, not part of EOL"
2700
2701=item *
27022034 "EIF - Loose unescaped quote"
2703
2704=item *
27052035 "EIF - Escaped EOF in unquoted field"
2706
2707=item *
27082036 "EIF - ESC error"
2709
2710=item *
27112037 "EIF - Binary character in unquoted field, binary off"
2712
2713=item *
27142110 "ECB - Binary character in Combine, binary off"
2715
2716=item *
27172200 "EIO - print to IO failed. See errno"
2718
2719=item *
27203001 "EHR - Unsupported syntax for column_names ()"
2721
2722=item *
27233002 "EHR - getline_hr () called before column_names ()"
2724
2725=item *
27263003 "EHR - bind_columns () and column_names () fields count mismatch"
2727
2728=item *
27293004 "EHR - bind_columns () only accepts refs to scalars"
2730
2731=item *
27323006 "EHR - bind_columns () did not pass enough refs for parsed fields"
2733
2734=item *
27353007 "EHR - bind_columns needs refs to writable scalars"
2736
2737=item *
27383008 "EHR - unexpected error in bound fields"
2739
2740=item *
27413009 "EHR - print_hr () called before column_names ()"
2742
2743=item *
27443010 "EHR - print_hr () called with invalid arguments"
2745
2746=back
2747
2748=head1 SEE ALSO
2749
2750L<Text::CSV_PP>, L<Text::CSV_XS> and L<Text::CSV::Encoded>.
2751
2752
2753=head1 AUTHORS and MAINTAINERS
2754
2755Alan Citterman F<E<lt>alan[at]mfgrtl.comE<gt>> wrote the original Perl
2756module. Please don't send mail concerning Text::CSV to Alan, as
2757he's not a present maintainer.
2758
2759Jochen Wiedmann F<E<lt>joe[at]ispsoft.deE<gt>> rewrote the encoding and
2760decoding in C by implementing a simple finite-state machine and added
2761the variable quote, escape and separator characters, the binary mode
2762and the print and getline methods. See ChangeLog releases 0.10 through
27630.23.
2764
2765H.Merijn Brand F<E<lt>h.m.brand[at]xs4all.nlE<gt>> cleaned up the code,
2766added the field flags methods, wrote the major part of the test suite,
2767completed the documentation, fixed some RT bugs. See ChangeLog releases
27680.25 and on.
2769
2770Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt> wrote Text::CSV_PP
2771which is the pure-Perl version of Text::CSV_XS.
2772
2773New Text::CSV (since 0.99) is maintained by Makamaka, and Kenichi Ishigaki
2774since 1.91.
2775
2776
2777=head1 COPYRIGHT AND LICENSE
2778
2779Text::CSV
2780
2781Copyright (C) 1997 Alan Citterman. All rights reserved.
2782Copyright (C) 2007-2015 Makamaka Hannyaharamitu.
2783Copyright (C) 2017- Kenichi Ishigaki
2784A large portion of the doc is taken from Text::CSV_XS. See below.
2785
2786Text::CSV_PP:
2787
2788Copyright (C) 2005-2015 Makamaka Hannyaharamitu.
2789Copyright (C) 2017- Kenichi Ishigaki
2790A large portion of the code/doc are also taken from Text::CSV_XS. See below.
2791
2792Text::CSV_XS:
2793
2794Copyright (C) 2007-2016 H.Merijn Brand for PROCURA B.V.
2795Copyright (C) 1998-2001 Jochen Wiedmann. All rights reserved.
2796Portions Copyright (C) 1997 Alan Citterman. All rights reserved.
2797
2798
2799This library is free software; you can redistribute it and/or modify
2800it under the same terms as Perl itself.
2801
2802=cut
Note: See TracBrowser for help on using the repository browser.