[33235] | 1 | package Text::CSV_PP;
|
---|
| 2 |
|
---|
| 3 | ################################################################################
|
---|
| 4 | #
|
---|
| 5 | # Text::CSV_PP - Text::CSV_XS compatible pure-Perl module
|
---|
| 6 | #
|
---|
| 7 | ################################################################################
|
---|
| 8 | require 5.006001;
|
---|
| 9 |
|
---|
| 10 | use strict;
|
---|
| 11 | use Exporter ();
|
---|
| 12 | use vars qw($VERSION @ISA @EXPORT_OK);
|
---|
| 13 | use Carp;
|
---|
| 14 |
|
---|
| 15 | $VERSION = '1.99';
|
---|
| 16 | @ISA = qw(Exporter);
|
---|
| 17 | @EXPORT_OK = qw(csv);
|
---|
| 18 |
|
---|
# Field type codes.
# NOTE(review): presumably the values callers pass to types () -- confirm.
# These three are declared without an empty prototype, unlike the flags below.
sub PV { 0 }
sub IV { 1 }
sub NV { 2 }

# Bit flags stored per field in the _FFLAGS array; is_quoted (), is_binary ()
# and is_missing () test them.
sub IS_QUOTED  () { 0x0001; }
sub IS_BINARY  () { 0x0002; }
sub IS_ERROR   () { 0x0004; }
sub IS_MISSING () { 0x0010; }

# Bit flags describing which hooks are installed; the values are the same
# ones callbacks () ORs together for error / after_parse / before_print.
sub HOOK_ERROR        () { 0x0001; }
sub HOOK_AFTER_PARSE  () { 0x0002; }
sub HOOK_BEFORE_PRINT () { 0x0004; }

# NOTE(review): not used in this part of the file; presumably an EOF marker
# for the IO-based parser -- confirm against the parsing code.
sub useIO_EOF () { 0x0010; }
| 33 |
|
---|
# Numeric error code => message text.  The three-letter prefix of each message
# names the error class (INI, PRM, ECR, EOF, ...); code 0 means "no error".
# NOTE(review): "nun-unique" (1013) and "arguments(s)" (1500) look like typos,
# but the strings are presumably kept identical to Text::CSV_XS -- confirm
# before changing any message text.
my $ERRORS = {
    # Generic errors
    1000 => "INI - constructor failed",
    1001 => "INI - sep_char is equal to quote_char or escape_char",
    1002 => "INI - allow_whitespace with escape_char or quote_char SP or TAB",
    1003 => "INI - \\r or \\n in main attr not allowed",
    1004 => "INI - callbacks should be undef or a hashref",
    1005 => "INI - EOL too long",
    1006 => "INI - SEP too long",
    1007 => "INI - QUOTE too long",
    1008 => "INI - SEP undefined",

    1010 => "INI - the header is empty",
    1011 => "INI - the header contains more than one valid separator",
    1012 => "INI - the header contains an empty field",
    1013 => "INI - the header contains nun-unique fields",
    1014 => "INI - header called on undefined stream",

    # Syntax errors
    1500 => "PRM - Invalid/unsupported arguments(s)",
    1501 => "PRM - The key attribute is passed as an unsupported type",
    1502 => "PRM - The value attribute is passed without the key attribute",
    1503 => "PRM - The value attribute is passed as an unsupported type",

    # Parse errors
    2010 => "ECR - QUO char inside quotes followed by CR not part of EOL",
    2011 => "ECR - Characters after end of quoted field",
    2012 => "EOF - End of data in parsing input stream",
    2013 => "ESP - Specification error for fragments RFC7111",
    2014 => "ENF - Inconsistent number of fields",

    # EIQ - Error Inside Quotes
    2021 => "EIQ - NL char inside quotes, binary off",
    2022 => "EIQ - CR char inside quotes, binary off",
    2023 => "EIQ - QUO character not allowed",
    2024 => "EIQ - EOF cannot be escaped, not even inside quotes",
    2025 => "EIQ - Loose unescaped escape",
    2026 => "EIQ - Binary character inside quoted field, binary off",
    2027 => "EIQ - Quoted field not terminated",

    # EIF - Error Inside Field
    2030 => "EIF - NL char inside unquoted verbatim, binary off",
    2031 => "EIF - CR char is first char of field, not part of EOL",
    2032 => "EIF - CR char inside unquoted, not part of EOL",
    2034 => "EIF - Loose unescaped quote",
    2035 => "EIF - Escaped EOF in unquoted field",
    2036 => "EIF - ESC error",
    2037 => "EIF - Binary character in unquoted field, binary off",

    # Combine errors
    2110 => "ECB - Binary character in Combine, binary off",

    # IO errors
    2200 => "EIO - print to IO failed. See errno",

    # Hash-Ref errors
    3001 => "EHR - Unsupported syntax for column_names ()",
    3002 => "EHR - getline_hr () called before column_names ()",
    3003 => "EHR - bind_columns () and column_names () fields count mismatch",
    3004 => "EHR - bind_columns () only accepts refs to scalars",
    3006 => "EHR - bind_columns () did not pass enough refs for parsed fields",
    3007 => "EHR - bind_columns needs refs to writable scalars",
    3008 => "EHR - unexpected error in bound fields",
    3009 => "EHR - print_hr () called before column_names ()",
    3010 => "EHR - print_hr () called with invalid arguments",

    4001 => "PRM - The key does not exist as field in the data",

    0 => "",
};
| 104 |
|
---|
# Compile-time compatibility shims: make sure utf8::is_utf8 (and, on old
# perls, utf8::decode / utf8::encode) as well as Scalar::Util::reftype and
# Scalar::Util::readonly can be called by the rest of the module.
BEGIN {
    # NOTE(review): the `require 5.006001' at the top of the file should make
    # this first branch unreachable -- confirm before removing it.
    if ( $] < 5.006 ) {
        $INC{'bytes.pm'} = 1 unless $INC{'bytes.pm'}; # dummy
        no strict 'refs';
        *{"utf8::is_utf8"} = sub { 0; };
        *{"utf8::decode"} = sub { };
    }
    # Perl older than 5.8: install do-nothing stand-ins for the utf8:: API.
    elsif ( $] < 5.008 ) {
        no strict 'refs';
        *{"utf8::is_utf8"} = sub { 0; };
        *{"utf8::decode"} = sub { };
        *{"utf8::encode"} = sub { };
    }
    # utf8::is_utf8 is missing: borrow the implementation from Encode.
    elsif ( !defined &utf8::is_utf8 ) {
        require Encode;
        *utf8::is_utf8 = *Encode::is_utf8;
    }

    # Prefer the real Scalar::Util; if it cannot be loaded, emulate the two
    # functions used by this module on top of B.  Without B either, give up.
    eval q| require Scalar::Util |;
    if ( $@ ) {
        eval q| require B |;
        if ( $@ ) {
            Carp::croak $@;
        }
        else {
            # B object class => name reported by reftype ().
            my %tmap = qw(
                B::NULL SCALAR
                B::HV HASH
                B::AV ARRAY
                B::CV CODE
                B::IO IO
                B::GV GLOB
                B::REGEXP REGEXP
            );
            # Returns undef for non-references; classes not listed in %tmap
            # are reported as 'REF' (reference to a reference) or 'SCALAR'.
            *Scalar::Util::reftype = sub (\$) {
                my $r = shift;
                return undef unless length(ref($r));
                my $t = ref(B::svref_2object($r));
                return
                    exists $tmap{$t} ? $tmap{$t}
                  : length(ref($$r)) ? 'REF'
                  : 'SCALAR';
            };
            # True when the read-only flag bit is set on the referenced value.
            *Scalar::Util::readonly = sub (\$) {
                my $b = B::svref_2object( $_[0] );
                $b->FLAGS & 0x00800000; # SVf_READONLY?
            };
        }
    }
}
| 155 |
|
---|
| 156 | ################################################################################
|
---|
| 157 | #
|
---|
| 158 | # Common pure perl methods, taken almost directly from Text::CSV_XS.
|
---|
| 159 | # (These should be moved into a common class eventually, so that
|
---|
| 160 | # both XS and PP don't need to apply the same changes.)
|
---|
| 161 | #
|
---|
| 162 | ################################################################################
|
---|
| 163 |
|
---|
| 164 | ################################################################################
|
---|
| 165 | # version
|
---|
| 166 | ################################################################################
|
---|
| 167 |
|
---|
# Report the version of this module (the package variable $VERSION).
sub version {
    $VERSION;
}
| 171 |
|
---|
| 172 | ################################################################################
|
---|
| 173 | # new
|
---|
| 174 | ################################################################################
|
---|
| 175 |
|
---|
# Default value of every object attribute; new () starts each object from a
# copy of this hash.  Only keys starting with a lower-case letter are accepted
# from the caller by new (); the `_'-prefixed and upper-case keys are internal.
my %def_attr = (
    eol => '',
    sep_char => ',',
    quote_char => '"',
    escape_char => '"',
    binary => 0,
    decode_utf8 => 1,
    auto_diag => 0,
    diag_verbose => 0,
    strict => 0,
    blank_is_undef => 0,
    empty_is_undef => 0,
    allow_whitespace => 0,
    allow_loose_quotes => 0,
    allow_loose_escapes => 0,
    allow_unquoted_escape => 0,
    always_quote => 0,
    quote_empty => 0,
    quote_space => 1,
    quote_binary => 1,
    escape_null => 1,
    keep_meta_info => 0,
    verbatim => 0,
    formula => 0,
    undef_str => undef,
    types => undef,
    callbacks => undef,

    # Internal per-object state.
    _EOF => 0,
    _RECNO => 0,
    _STATUS => undef,
    _FIELDS => undef,
    _FFLAGS => undef,
    _STRING => undef,
    _ERROR_INPUT => undef,
    _COLUMN_NAMES => undef,
    _BOUND_COLUMNS => undef,
    _AHEAD => undef,

    # NOTE(review): not referenced in this part of the file -- confirm usage.
    ENCODING => undef,
);
| 217 |
|
---|
# Alternative spellings of attribute names; new () replaces each alias
# (left) by its canonical attribute name (right) before validating it.
my %attr_alias = (
    quote_always => "always_quote",
    verbose_diag => "diag_verbose",
    quote_null => "escape_null",
    escape => "escape_char",
);
| 224 |
|
---|
# Diagnostic left behind by the most recent constructor call: new () assigns
# it, and error_diag () falls back to it when there is no per-object error.
my $last_new_error = Text::CSV_PP->SetDiag(0);
# NOTE(review): not referenced in this part of the file -- confirm it is used.
my $last_error;
| 227 |
|
---|
# NOT a method: is also used before bless
#
# Returns error code 1002 when allow_whitespace ($aw) is requested while the
# effective quote (multi-char `quote', else `quote_char') or the escape_char
# starts with a space or a TAB; returns 0 otherwise.
sub _unhealthy_whitespace {
    my ($self, $aw) = @_;

    # no checks needed without allow_whitespace
    return 0 unless $aw;

    my $quote = $self->{quote};
    $quote = $self->{quote_char} unless defined $quote && length ($quote);
    my $escape = $self->{escape_char};

    foreach my $chr ($quote, $escape) {
        return 1002 if defined $chr && $chr =~ m/^[ \t]/;
    }

    return 0;
}
| 242 |
|
---|
# Validate the separator, quote, escape and eol settings of $self (a plain
# or blessed hash).  Returns the first applicable error code (1001, 1003,
# 1005 .. 1008), otherwise the result of the allow_whitespace check.
sub _check_sanity {
    my $self = shift;

    my ($eol, $sep, $quo, $esc) = @{$self}{qw( eol sep quote escape_char )};

    # The multi-character forms take precedence over the *_char attributes.
    $sep = $self->{sep_char}   unless defined $sep && length ($sep);
    $quo = $self->{quote_char} unless defined $quo && length ($quo);

    # sep_char should not be undefined
    return 1008 if $sep eq "";
    return 1006 if length ($sep) > 16;
    return 1003 if $sep =~ m/[\r\n]/;

    if (defined $quo) {
        return 1001 if $quo eq $sep;
        return 1007 if length ($quo) > 16;
        return 1003 if $quo =~ m/[\r\n]/;
    }

    if (defined $esc) {
        return 1001 if $esc eq $sep;
        return 1003 if $esc =~ m/[\r\n]/;
    }

    return 1005 if defined $eol && length ($eol) > 16;

    return _unhealthy_whitespace ($self, $self->{allow_whitespace});
}
| 277 |
|
---|
# Return the sorted list of attribute names a caller may pass to new ().
#
# Public attributes are "sep", "quote" and every %def_attr key that starts
# with a lower-case letter -- the same rule new () applies when it validates
# its arguments.  The previous filter (anything not starting with `_') also
# reported the internal ENCODING key, which new () rejects as unknown.
sub known_attributes {
    my @public = grep { m/^[a-z]/ } "sep", "quote", keys %def_attr;
    return sort @public;
}
| 281 |
|
---|
# Constructor: Text::CSV_PP->new ([{ option => value, ... }]).
# Returns the blessed object, or nothing when the arguments are rejected; in
# that case the diagnostic is left in $last_new_error.
sub new {
    # Pre-set a usage diagnostic so every early return below leaves one behind.
    $last_new_error = Text::CSV_PP->SetDiag(1000,
        'usage: my $csv = Text::CSV_PP->new ([{ option => value, ... }]);');

    my $proto = shift;
    my $class = ref ($proto) || $proto or return;
    # The only accepted argument is a single hash reference.
    @_ > 0 && ref $_[0] ne "HASH" and return;
    my $attr = shift || {};
    # Normalise the keys: lower-case plain identifiers, then resolve aliases.
    my %attr = map {
        my $k = m/^[a-zA-Z]\w+$/ ? lc $_ : $_;
        exists $attr_alias{$k} and $k = $attr_alias{$k};
        $k => $attr->{$_};
    } keys %$attr;

    # "sep" and "quote" are first folded into sep_char / quote_char so they
    # pass the attribute check below; they are split again afterwards.
    my $sep_aliased = 0;
    if (exists $attr{sep}) {
        $attr{sep_char} = delete $attr{sep};
        $sep_aliased = 1;
    }
    my $quote_aliased = 0;
    if (exists $attr{quote}) {
        $attr{quote_char} = delete $attr{quote};
        $quote_aliased = 1;
    }
    # formula_handling is an alias of formula; the value is mapped to its
    # numeric code (croaks for unsupported values).
    exists $attr{formula_handling} and
        $attr{formula} = delete $attr{formula_handling};
    exists $attr{formula} and
        $attr{formula} = _supported_formula (undef, $attr{formula});
    # Every remaining key must be a public (lower-case) default attribute.
    for (keys %attr) {
        if (m/^[a-z]/ && exists $def_attr{$_}) {
            # uncoverable condition false
            defined $attr{$_} && m/_char$/ and utf8::decode ($attr{$_});
            next;
        }
        # croak?
        $last_new_error = Text::CSV_PP->SetDiag(1000, "INI - Unknown attribute '$_'");
        # Called as a plain function in void context: reports the diagnostic.
        $attr{auto_diag} and error_diag ();
        return;
    }
    # A value for which unpack "U0C*" yields more than one element is kept in
    # `sep', with "\0" stored in sep_char; otherwise `sep' is left undef.
    if ($sep_aliased) {
        my @b = unpack "U0C*", $attr{sep_char};
        if (@b > 1) {
            $attr{sep} = $attr{sep_char};
            $attr{sep_char} = "\0";
        }
        else {
            $attr{sep} = undef;
        }
    }
    # Same treatment for `quote' / quote_char.
    if ($quote_aliased and defined $attr{quote_char}) {
        my @b = unpack "U0C*", $attr{quote_char};
        if (@b > 1) {
            $attr{quote} = $attr{quote_char};
            $attr{quote_char} = "\0";
        }
        else {
            $attr{quote} = undef;
        }
    }

    # Caller-supplied values override the defaults.
    my $self = { %def_attr, %attr };
    if (my $ec = _check_sanity ($self)) {
        $last_new_error = Text::CSV_PP->SetDiag($ec);
        $attr{auto_diag} and error_diag ();
        return;
    }
    # A callbacks value that is not a hash ref is dropped with a warning
    # instead of failing the constructor.
    if (defined $self->{callbacks} && ref $self->{callbacks} ne "HASH") {
        Carp::carp "The 'callbacks' attribute is set but is not a hash: ignored\n";
        $self->{callbacks} = undef;
    }

    $last_new_error = Text::CSV_PP->SetDiag(0);
    # Without an explicit eol, inherit a defined output record separator ($\).
    defined $\ && !exists $attr{eol} and $self->{eol} = $\;
    bless $self, $class;
    # Run a supplied types list through the setter (it also builds _types).
    defined $self->{types} and $self->types ($self->{types});
    $self;
}
| 359 |
|
---|
# Keep in sync with XS!
# Attribute name => numeric id; the accessors pass these ids to _cache_set ().
my %_cache_id = ( # Only expose what is accessed from within PM
    quote_char => 0,
    escape_char => 1,
    sep_char => 2,
    sep => 39, # 39 .. 55
    binary => 3,
    keep_meta_info => 4,
    always_quote => 5,
    allow_loose_quotes => 6,
    allow_loose_escapes => 7,
    allow_unquoted_escape => 8,
    allow_whitespace => 9,
    blank_is_undef => 10,
    eol => 11,
    quote => 15,
    verbatim => 22,
    empty_is_undef => 23,
    auto_diag => 24,
    diag_verbose => 33,
    quote_space => 25,
    quote_empty => 37,
    quote_binary => 32,
    escape_null => 31,
    decode_utf8 => 35,
    _has_ahead => 30,
    _has_hooks => 36,
    _is_bound => 26, # 26 .. 29
    formula => 38,
    # NOTE(review): 42 and 46 fall inside the "39 .. 55" range noted for
    # `sep' above -- confirm this overlap is harmless in _cache_set ().
    strict => 42,
    undef_str => 46,
);
| 392 |
|
---|
# Additional name => id pairs that are not part of %_cache_id.  Built with
# qw (), so the ids are strings here.
# NOTE(review): sep_len shares id 38 with `formula' in %_cache_id -- confirm.
my %_hidden_cache_id = qw(
    sep_len 38
    eol_len 12
    eol_is_cr 13
    quo_len 16
    has_error_input 34
);
| 400 |
|
---|
# Reverse lookup, id => name, over both id tables.  For an id that occurs in
# both tables (38: `formula' and `sep_len') the later pair wins, i.e. the
# name from %_hidden_cache_id.
my %_reverse_cache_id = (
    map({$_cache_id{$_} => $_} keys %_cache_id),
    map({$_hidden_cache_id{$_} => $_} keys %_hidden_cache_id),
);
| 405 |
|
---|
# A `character'
#
# Store a character attribute: undef becomes 0, the value is utf8-decoded,
# written to the object, validated (croaks with the diagnostic on failure)
# and finally pushed to the cache.
sub _set_attr_C {
    my ($self, $name, $val, $ec) = @_;

    $val = 0 unless defined $val;
    utf8::decode ($val);
    $self->{$name} = $val;

    if ($ec = _check_sanity ($self)) {
        croak ($self->SetDiag ($ec));
    }

    $self->_cache_set ($_cache_id{$name}, $val);
}
| 415 |
|
---|
# A flag
#
# Store a flag attribute (undef is stored as 0) and push its numeric value
# to the cache.
sub _set_attr_X {
    my ($self, $name, $val) = @_;

    $val = 0 unless defined $val;
    $self->{$name} = $val;

    return $self->_cache_set ($_cache_id{$name}, 0 + $val);
}
| 423 |
|
---|
# A number
#
# Store a numeric attribute as given and push its numeric value to the cache.
sub _set_attr_N {
    my ($self, $name, $val) = @_;

    $self->{$name} = $val;

    return $self->_cache_set ($_cache_id{$name}, 0 + $val);
}
| 430 |
|
---|
| 431 | # Accessor methods.
|
---|
| 432 | # It is unwise to change them halfway through a single file!
|
---|
# Get or set the single-character quote.  Setting it also clears the cached
# multi-character `quote'.
sub quote_char {
    my ($self, @new) = @_;

    if (@new) {
        $self->_set_attr_C ("quote_char", $new[0]);
        $self->_cache_set ($_cache_id{quote}, "");
    }

    return $self->{quote_char};
}
| 441 |
|
---|
# Get or set the quote, which may be longer than one character.
#
# Setter: when unpack "U0C*" yields more than one element for the value, it
# is kept in `quote' and quote_char is set to "\0"; otherwise it is stored in
# quote_char and `quote' is emptied.  Croaks with 1007 when the value is too
# long, or with whatever _check_sanity () reports.
# Getter: returns `quote' when it is non-empty, else quote_char.
sub quote {
    my $self = shift;

    if (@_) {
        my $new = shift;
        $new = "" unless defined $new;
        utf8::decode ($new);

        my @bytes = unpack "U0C*", $new;
        if (@bytes > 1) {
            croak ($self->SetDiag (1007)) if @bytes > 16;
            $self->quote_char ("\0");
        }
        else {
            $self->quote_char ($new);
            $new = "";
        }
        $self->{quote} = $new;

        if (my $ec = _check_sanity ($self)) {
            croak ($self->SetDiag ($ec));
        }

        $self->_cache_set ($_cache_id{quote}, $new);
    }

    my $multi = $self->{quote};
    return defined $multi && length ($multi) ? $multi : $self->{quote_char};
}
| 467 |
|
---|
# Get or set the escape character.  Setting a false value (undef, "" or "0")
# also switches escape_null off.
sub escape_char {
    my $self = shift;

    if (@_) {
        my $chr = shift;
        $self->_set_attr_C ("escape_char", $chr);
        $self->_set_attr_X ("escape_null", 0) unless $chr;
    }

    return $self->{escape_char};
}
| 477 |
|
---|
# Get or set the single-character separator.  Setting it also clears the
# cached multi-character `sep'.
sub sep_char {
    my ($self, @new) = @_;

    if (@new) {
        $self->_set_attr_C ("sep_char", $new[0]);
        $self->_cache_set ($_cache_id{sep}, "");
    }

    return $self->{sep_char};
}
| 486 |
|
---|
# Get or set the separator, which may be longer than one character.
#
# Setter: when unpack "U0C*" yields more than one element for the value, it
# is kept in `sep' and sep_char is set to "\0"; otherwise it is stored in
# sep_char and `sep' is emptied.  Croaks with 1006 when the value is too
# long, or with whatever _check_sanity () reports.
# Getter: returns `sep' when it is non-empty, else sep_char.
sub sep {
    my $self = shift;

    if (@_) {
        my $new = shift;
        $new = "" unless defined $new;
        utf8::decode ($new);

        my @bytes = unpack "U0C*", $new;
        if (@bytes > 1) {
            croak ($self->SetDiag (1006)) if @bytes > 16;
            $self->sep_char ("\0");
        }
        else {
            $self->sep_char ($new);
            $new = "";
        }
        $self->{sep} = $new;

        if (my $ec = _check_sanity ($self)) {
            croak ($self->SetDiag ($ec));
        }

        $self->_cache_set ($_cache_id{sep}, $new);
    }

    my $multi = $self->{sep};
    return defined $multi && length ($multi) ? $multi : $self->{sep_char};
}
| 512 |
|
---|
# Get or set the end-of-line string.  undef is stored as ""; a value longer
# than 16 characters croaks with error 1005.
sub eol {
    my $self = shift;
    return $self->{eol} unless @_;

    my $eol = shift;
    $eol = "" unless defined $eol;
    croak ($self->SetDiag (1005)) if length ($eol) > 16;

    $self->{eol} = $eol;
    $self->_cache_set ($_cache_id{eol}, $eol);

    return $self->{eol};
}
| 524 |
|
---|
# Accessor for the `always_quote' flag: stores the argument when one is
# given, and always returns the current value.
sub always_quote {
    my ($self, @new) = @_;
    $self->_set_attr_X ("always_quote", $new[0]) if @new;
    return $self->{always_quote};
}
| 530 |
|
---|
# Accessor for the `quote_space' flag: stores the argument when one is
# given, and always returns the current value.
sub quote_space {
    my ($self, @new) = @_;
    $self->_set_attr_X ("quote_space", $new[0]) if @new;
    return $self->{quote_space};
}
| 536 |
|
---|
# Accessor for the `quote_empty' flag: stores the argument when one is
# given, and always returns the current value.
sub quote_empty {
    my ($self, @new) = @_;
    $self->_set_attr_X ("quote_empty", $new[0]) if @new;
    return $self->{quote_empty};
}
| 542 |
|
---|
# Accessor for the `escape_null' flag: stores the argument when one is
# given, and always returns the current value.
sub escape_null {
    my ($self, @new) = @_;
    $self->_set_attr_X ("escape_null", $new[0]) if @new;
    return $self->{escape_null};
}
| 548 |
|
---|
# Alias of escape_null (): jumps there with the caller's arguments unchanged.
sub quote_null { goto &escape_null; }
| 550 |
|
---|
# Accessor for the `quote_binary' flag: stores the argument when one is
# given, and always returns the current value.
sub quote_binary {
    my ($self, @new) = @_;
    $self->_set_attr_X ("quote_binary", $new[0]) if @new;
    return $self->{quote_binary};
}
| 556 |
|
---|
# Accessor for the `binary' flag: stores the argument when one is given,
# and always returns the current value.
sub binary {
    my ($self, @new) = @_;
    $self->_set_attr_X ("binary", $new[0]) if @new;
    return $self->{binary};
}
| 562 |
|
---|
# Accessor for the `strict' flag: stores the argument when one is given,
# and always returns the current value.
sub strict {
    my ($self, @new) = @_;
    $self->_set_attr_X ("strict", $new[0]) if @new;
    return $self->{strict};
}
| 568 |
|
---|
# Set diagnostic $err and return a message combining the resulting
# error_diag () text with the extra information $msg.
#
# A purely numeric diagnostic is dropped and $msg is prefixed with "# ".
# The parts are joined with ": ", or with "\n\t" when the diagnostic ends in
# ";" or a newline; parts without three consecutive non-blanks are omitted.
sub _SetDiagInfo {
    my ($self, $err, $msg) = @_;

    $self->SetDiag ($err);
    my $diag = $self->error_diag;

    if ($diag =~ s/^\d+$//) {
        $msg =~ s/^/# /;
    }

    my $glue  = ($diag =~ m/[;\n]$/) ? "\n\t" : ": ";
    my @parts = grep { m/\S\S\S/ } $diag, $msg;

    return join $glue, @parts;
}
| 577 |
|
---|
# Translate a formula-handling setting (number or name, case-insensitive)
# into its numeric code:
#   0/none  1/die  2/croak  3/diag  4/empty (or "")  5/undef
# An undefined setting yields 5.  Anything else croaks with diagnostic 1500;
# $self may be undef, in which case the class name is used for reporting.
sub _supported_formula {
    my ($self, $f) = @_;

    return 5 unless defined $f;

    return 0 if $f =~ m/^(?: 0 | none    )$/xi;
    return 1 if $f =~ m/^(?: 1 | die     )$/xi;
    return 2 if $f =~ m/^(?: 2 | croak   )$/xi;
    return 3 if $f =~ m/^(?: 3 | diag    )$/xi;
    return 4 if $f =~ m/^(?: 4 | empty | )$/xi;
    return 5 if $f =~ m/^(?: 5 | undef   )$/xi;

    $self ||= "Text::CSV_PP";
    croak ($self->_SetDiagInfo (1500, "formula-handling '$f' is not supported"));
}
| 591 |
|
---|
# Get or set the formula handling.  The setter accepts a number or a name
# and stores the numeric code; the return value is always the name of the
# current setting.
sub formula {
    my $self = shift;

    if (@_) {
        $self->_set_attr_N ("formula", _supported_formula ($self, shift));
    }

    my @name = qw( none die croak diag empty undef );
    return $name[_supported_formula ($self, $self->{formula})];
}
# Alias of formula (): forwards all arguments and returns its result.
sub formula_handling {
    my ($self, @args) = @_;
    return $self->formula (@args);
}
| 601 |
|
---|
# Accessor for the `decode_utf8' flag: stores the argument when one is
# given, and always returns the current value.
sub decode_utf8 {
    my ($self, @new) = @_;
    $self->_set_attr_X ("decode_utf8", $new[0]) if @new;
    return $self->{decode_utf8};
}
| 607 |
|
---|
# Get or set `keep_meta_info'.  In the setter undef and "" count as 0, a
# value starting with a digit is used as is, "false" (any case) becomes 0
# and every other string becomes 1.
sub keep_meta_info {
    my $self = shift;

    if (@_) {
        my $v = shift;
        $v = 0 if !defined $v || $v eq "";
        unless ($v =~ m/^[0-9]/) {
            $v = (lc ($v) eq "false") ? 0 : 1; # true/truth = 1
        }
        $self->_set_attr_X ("keep_meta_info", $v);
    }

    return $self->{keep_meta_info};
}
| 618 |
|
---|
# Accessor for the `allow_loose_quotes' flag: stores the argument when one
# is given, and always returns the current value.
sub allow_loose_quotes {
    my ($self, @new) = @_;
    $self->_set_attr_X ("allow_loose_quotes", $new[0]) if @new;
    return $self->{allow_loose_quotes};
}
| 624 |
|
---|
# Accessor for the `allow_loose_escapes' flag: stores the argument when one
# is given, and always returns the current value.
sub allow_loose_escapes {
    my ($self, @new) = @_;
    $self->_set_attr_X ("allow_loose_escapes", $new[0]) if @new;
    return $self->{allow_loose_escapes};
}
| 630 |
|
---|
# Get or set the `allow_whitespace' flag.  Enabling it while the quote or
# escape character is a space or TAB croaks with error 1002.
sub allow_whitespace {
    my $self = shift;

    if (@_) {
        my $flag = shift;
        croak ($self->SetDiag (1002)) if _unhealthy_whitespace ($self, $flag);
        $self->_set_attr_X ("allow_whitespace", $flag);
    }

    return $self->{allow_whitespace};
}
| 641 |
|
---|
# Accessor for the `allow_unquoted_escape' flag: stores the argument when
# one is given, and always returns the current value.
sub allow_unquoted_escape {
    my ($self, @new) = @_;
    $self->_set_attr_X ("allow_unquoted_escape", $new[0]) if @new;
    return $self->{allow_unquoted_escape};
}
| 647 |
|
---|
# Accessor for the `blank_is_undef' flag: stores the argument when one is
# given, and always returns the current value.
sub blank_is_undef {
    my ($self, @new) = @_;
    $self->_set_attr_X ("blank_is_undef", $new[0]) if @new;
    return $self->{blank_is_undef};
}
| 653 |
|
---|
# Accessor for the `empty_is_undef' flag: stores the argument when one is
# given, and always returns the current value.
sub empty_is_undef {
    my ($self, @new) = @_;
    $self->_set_attr_X ("empty_is_undef", $new[0]) if @new;
    return $self->{empty_is_undef};
}
| 659 |
|
---|
# Accessor for the `verbatim' flag: stores the argument when one is given,
# and always returns the current value.
sub verbatim {
    my ($self, @new) = @_;
    $self->_set_attr_X ("verbatim", $new[0]) if @new;
    return $self->{verbatim};
}
| 665 |
|
---|
# Get or set `undef_str'.  A defined value is stored as a string, undef is
# stored as undef; the cache is updated with the stored value.
sub undef_str {
    my $self = shift;

    if (@_) {
        my $str = shift;
        $str = "$str" if defined $str;
        $self->{undef_str} = $str;
        $self->_cache_set ($_cache_id{undef_str}, $self->{undef_str});
    }

    return $self->{undef_str};
}
| 675 |
|
---|
# Get or set `auto_diag'.  In the setter undef and "" count as 0, a value
# starting with a digit is used as is, "false" (any case) becomes 0 and
# every other string becomes 1.
sub auto_diag {
    my $self = shift;

    if (@_) {
        my $v = shift;
        $v = 0 if !defined $v || $v eq "";
        unless ($v =~ m/^[0-9]/) {
            $v = (lc ($v) eq "false") ? 0 : 1; # true/truth = 1
        }
        $self->_set_attr_X ("auto_diag", $v);
    }

    return $self->{auto_diag};
}
| 686 |
|
---|
# Get or set `diag_verbose'.  In the setter undef and "" count as 0, a value
# starting with a digit is used as is, "false" (any case) becomes 0 and
# every other string becomes 1.
sub diag_verbose {
    my $self = shift;

    if (@_) {
        my $v = shift;
        $v = 0 if !defined $v || $v eq "";
        unless ($v =~ m/^[0-9]/) {
            $v = (lc ($v) eq "false") ? 0 : 1; # true/truth = 1
        }
        $self->_set_attr_X ("diag_verbose", $v);
    }

    return $self->{diag_verbose};
}
| 697 |
|
---|
| 698 | ################################################################################
|
---|
| 699 | # status
|
---|
| 700 | ################################################################################
|
---|
| 701 |
|
---|
# Return the stored status (the _STATUS slot of the object).
sub status {
    my ($self) = @_;
    return $self->{_STATUS};
}
| 705 |
|
---|
# Return the stored end-of-file indicator (the _EOF slot of the object).
sub eof {
    my ($self) = @_;
    return $self->{_EOF};
}
| 709 |
|
---|
# Get or set the list of column types.
#
# Without arguments: returns the stored array reference.
# With a true argument (an array ref of numeric codes): stores it in
# `types', stores the codes packed as one character each in `_types', and
# returns the array reference.
# With a false argument: removes both slots and returns undef.
sub types {
    my $self = shift;

    return $self->{'types'} unless @_;

    my $types = shift;
    unless ($types) {
        delete @{$self}{'types', '_types'};
        return undef;
    }

    $self->{'_types'} = join ("", map { chr ($_) } @$types);
    return $self->{'types'} = $types;
}
| 728 |
|
---|
# Get or set the callbacks.
#
# The setter accepts a single hash ref, a flat list of name => code pairs,
# or a single undef to remove all callbacks.  Every name must consist of
# word characters and dots and every value must be a code ref; any other
# input croaks with error 1004.  The `_has_hooks' flag records which of
# error (0x01), after_parse (0x02) and before_print (0x04) are present.
sub callbacks {
    my $self = shift;
    return $self->{callbacks} unless @_;

    my $hooks;
    my $flags = 0x00;

    if (defined $_[0]) {
        croak ($self->SetDiag (1004)) if grep { !defined } @_;

        if (@_ == 1 && ref $_[0] eq "HASH") {
            $hooks = shift;
        }
        elsif (@_ % 2 == 0) {
            $hooks = { @_ };
        }
        else {
            croak ($self->SetDiag (1004));
        }

        for my $name (keys %$hooks) {
            # A key cannot be a ref. That would be stored as the *string
            # 'SCALAR(0x1f3e710)' or 'ARRAY(0x1a5ae18)'
            next if $name =~ m/^[\w.]+$/ && ref $hooks->{$name} eq "CODE";
            croak ($self->SetDiag (1004));
        }

        $flags |= 0x01 if exists $hooks->{error};
        $flags |= 0x02 if exists $hooks->{after_parse};
        $flags |= 0x04 if exists $hooks->{before_print};
    }
    elsif (@_ > 1) {
        # (undef, whatever)
        croak ($self->SetDiag (1004));
    }

    $self->_set_attr_X ("_has_hooks", $flags);
    $self->{callbacks} = $hooks;

    return $self->{callbacks};
}
| 758 |
|
---|
| 759 | ################################################################################
|
---|
| 760 | # error_diag
|
---|
| 761 | ################################################################################
|
---|
| 762 |
|
---|
# Report the last diagnostic.  May be called as object method, class method
# or plain function.
#
# The diagnostic is the list (code, message, position, record, field).  It
# starts out as the last constructor diagnostic and is replaced by the
# object's own diagnostic when $self is an object that has one.
#
#   list context   : returns the five-element list
#   scalar context : returns the message part
#   void context   : "auto-diag" -- prints or throws the message and
#                    returns nothing
#
# If the object has an `error' callback and the code is non-zero, the
# callback is invoked with the list and its result is returned instead.
sub error_diag {
    my $self = shift;
    my @diag = (0 + $last_new_error, $last_new_error, 0, 0, 0);

    # Docs state to NEVER use UNIVERSAL::isa, because it will *never* call an
    # overridden isa method in any class. Well, that is exactly what I want here
    if ($self && ref $self && # Not a class method or direct call
        UNIVERSAL::isa ($self, __PACKAGE__) && exists $self->{_ERROR_DIAG}) {
        $diag[0] = 0 + $self->{_ERROR_DIAG};
        $diag[1] = $self->{_ERROR_DIAG};
        # _ERROR_POS is reported one higher than stored.
        $diag[2] = 1 + $self->{_ERROR_POS} if exists $self->{_ERROR_POS};
        $diag[3] = $self->{_RECNO};
        $diag[4] = $self->{_ERROR_FLD} if exists $self->{_ERROR_FLD};

        $diag[0] && $self->{callbacks} && $self->{callbacks}{error} and
            return $self->{callbacks}{error}->(@diag);
    }

    my $context = wantarray;

    unless (defined $context) { # Void context, auto-diag
        # Code 2012 (end of data) is never reported automatically.
        if ($diag[0] && $diag[0] != 2012) {
            my $msg = "# CSV_PP ERROR: $diag[0] - $diag[1] \@ rec $diag[3] pos $diag[2]\n";
            # `$' matches before the trailing newline, so the field number is
            # appended to the first line.
            $diag[4] and $msg =~ s/$/ field $diag[4]/;

            unless ($self && ref $self) { # auto_diag
                # called without args in void context
                warn $msg;
                return;
            }

            # Verbose mode adds the offending input and a caret line.
            # NOTE(review): the lone "'" after the input has no opening
            # counterpart -- confirm this is the intended output format.
            if ($self->{diag_verbose} and $self->{_ERROR_INPUT}) {
                $msg .= "$self->{_ERROR_INPUT}'\n";
                $msg .= " " x ($diag[2] - 1);
                $msg .= "^\n";
            }

            # auto_diag above 1 makes the error fatal.  Below that, the level
            # is raised by one when the hint hash of the frame two levels up
            # shows autodie (or a guarded Fatal) to be in effect.
            my $lvl = $self->{auto_diag};
            if ($lvl < 2) {
                my @c = caller (2);
                if (@c >= 11 && $c[10] && ref $c[10] eq "HASH") {
                    my $hints = $c[10];
                    (exists $hints->{autodie} && $hints->{autodie} or
                     exists $hints->{"guard Fatal"} &&
                    !exists $hints->{"no Fatal"}) and
                        $lvl++;
                    # Future releases of autodie will probably set $^H{autodie}
                    # to "autodie @args", like "autodie :all" or "autodie open"
                    # so we can/should check for "open" or "new"
                }
            }
            $lvl > 1 ? die $msg : warn $msg;
        }
        return;
    }

    return $context ? @diag : $diag[1];
}
| 821 |
|
---|
# Return the stored record counter (the _RECNO slot of the object).
sub record_number {
    my ($self) = @_;
    return $self->{_RECNO};
}
| 825 |
|
---|
| 826 | ################################################################################
|
---|
| 827 | # string
|
---|
| 828 | ################################################################################
|
---|
| 829 |
|
---|
# string () is the public name of _string ().
*string = \&_string;

# Return the string referenced by the _STRING slot, or undef when the slot
# is not set.
sub _string {
    my ($self) = @_;
    return defined $self->{_STRING} ? ${ $self->{_STRING} } : undef;
}
| 834 |
|
---|
| 835 | ################################################################################
|
---|
| 836 | # fields
|
---|
| 837 | ################################################################################
|
---|
| 838 |
|
---|
# fields () is a public alias for _fields ().
*fields = \&_fields;

# Return the list held in the object's _FIELDS array reference (its
# element count in scalar context), or undef when no reference is stored.
sub _fields {
    my $stored = $_[0]->{_FIELDS};
    return ref $stored ? @{$stored} : undef;
}
| 843 |
|
---|
| 844 | ################################################################################
|
---|
| 845 | # meta_info
|
---|
| 846 | ################################################################################
|
---|
| 847 |
|
---|
# Return the per-field flag values stored in _FFLAGS as a list (the count
# in scalar context), or undef when no flags are stored.
sub meta_info {
    my $flags = $_[0]->{_FFLAGS};
    return $flags ? @{$flags} : undef;
}
| 851 |
|
---|
# Tell whether the field at the given (0-based) index carries the
# IS_QUOTED flag.  Returns 1 or 0, or nothing when no flags are stored or
# the index is not a non-negative integer within range.
sub is_quoted {
    my ($self, $idx) = @_;
    my $flags = $self->{_FFLAGS};

    defined $flags or return;
    $idx =~ /\D/ || $idx < 0 || $idx > $#{$flags} and return;

    return ($flags->[$idx] & IS_QUOTED) ? 1 : 0;
}
| 858 |
|
---|
# Tell whether the field at the given (0-based) index carries the
# IS_BINARY flag.  Returns 1 or 0, or nothing when no flags are stored or
# the index is not a non-negative integer within range.
sub is_binary {
    my ($self, $idx) = @_;
    my $flags = $self->{_FFLAGS};

    defined $flags or return;
    $idx =~ /\D/ || $idx < 0 || $idx > $#{$flags} and return;

    return ($flags->[$idx] & IS_BINARY) ? 1 : 0;
}
| 864 |
|
---|
# Tell whether the field at the given index is flagged IS_MISSING.
# Only answers when keep_meta_info is set; returns nothing for a negative
# index or when no flags are stored, 1 for an index past the last stored
# flag, and otherwise 1 or 0 according to the flag.
sub is_missing {
    my ($self, $idx) = @_;

    $self->{keep_meta_info} or return;    # FIXME

    my $flags = $self->{_FFLAGS};
    return   if $idx < 0 || !ref $flags;
    return 1 if $idx >= @{$flags};

    return ($flags->[$idx] & IS_MISSING) ? 1 : 0;
}
| 872 |
|
---|
| 873 | ################################################################################
|
---|
| 874 | # combine
|
---|
| 875 | ################################################################################
|
---|
# combine () is a public alias for _combine ().
*combine = \&_combine;

# Combine the given fields into one string through __combine ().
# Stores the fields in _FIELDS, a reference to the resulting string in
# _STRING and the outcome in _STATUS, which is also the return value.
# An empty field list yields a false status without calling __combine ().
sub _combine {
    my ($self, @fields) = @_;
    my $str = "";

    $self->{_FIELDS} = \@fields;

    my $ok = @fields > 0;
    $ok &&= $self->__combine (\$str, \@fields, 0);

    @{$self}{qw( _STATUS _STRING )} = ($ok, \$str);
    return $self->{_STATUS};
}
| 885 |
|
---|
| 886 | ################################################################################
|
---|
| 887 | # parse
|
---|
| 888 | ################################################################################
|
---|
# parse () is a public alias for _parse ().
*parse = \&_parse;

# Parse one string into fields through __parse ().
# Croaks with diagnostic 1500 when handed a reference.  On success
# _FIELDS and _FFLAGS hold the parsed fields and their flags and _STATUS
# is 1; on failure (or an undefined string) both are reset to undef and
# _STATUS is 0.  _STRING always refers to the input.  Returns _STATUS.
sub _parse {
    my ($self, $str) = @_;

    croak ($self->SetDiag (1500)) if ref $str;

    $self->{_STRING} = \$str;

    my (@fields, @fflags);
    my $ok = defined $str && $self->__parse (\@fields, \@fflags, $str, 0);

    @{$self}{qw( _FIELDS _FFLAGS _STATUS )} = $ok
        ? (\@fields, \@fflags, 1)
        : (undef,    undef,    0);

    return $self->{_STATUS};
}
| 910 |
|
---|
# Get, reset or set the column names.
#   no arguments       - return the current names (empty list if none)
#   a single undef     - forget the current names, return undef
#   a single array ref - use its elements as the names
#   a list of scalars  - use the list as the names
# Croaks with 3001 when a list contains references, and with 3003 when
# the number of names differs from the number of bound columns.
# Undefined names are stored as "\cAUNDEF\cA".  Returns the stored names.
sub column_names {
    my ($self, @names) = @_;

    unless (@names) {
        my $current = $self->{_COLUMN_NAMES};
        return defined $current ? @{$current} : ();
    }

    if (@names == 1 && !defined $names[0]) {
        return $self->{_COLUMN_NAMES} = undef;
    }

    if (@names == 1 && ref $names[0] eq "ARRAY") {
        @names = @{$names[0]};
    }
    else {
        # Any reference among the names makes the joined ref-types non-empty
        my $ref_types = join "", map { defined $_ ? ref $_ : "" } @names;
        $ref_types and croak $self->SetDiag( 3001 );
    }

    my $bound = $self->{_BOUND_COLUMNS};
    $bound && @names != @{$bound} and croak $self->SetDiag( 3003 );

    my @stored = map { defined $_ ? $_ : "\cAUNDEF\cA" } @names;
    $self->{_COLUMN_NAMES} = \@stored;
    return @{$self->{_COLUMN_NAMES}};
}
| 931 |
|
---|
# Read the first line from $fh and treat it as the header line.
#
# Arguments after $fh are optional and may come in any order:
#   [ seps ]    - candidate separators (default "," and ";")
#   { options } - sep_set, detect_bom, munge_column_names (alias munge)
#                 and set_column_names
#
# The separator found in the line is installed with sep (), a leading
# byte-order mark is optionally stripped and turned into an encoding
# layer on $fh, the line is parsed, and - unless disabled - the fields
# become the column names.
#
# Returns the header fields in list context, the object itself otherwise.
# Croaks on a missing handle (1014), an empty header (1010), more than
# one candidate separator in the line (1011), an empty field (1012) or
# duplicate fields (1013).
sub header {
    my ($self, $fh, @args) = @_;

    $fh or croak ($self->SetDiag (1014));

    my (@seps, %args);
    foreach my $arg (@args) {
        my $kind = ref $arg;
        if ($kind eq "ARRAY") {
            push @seps, @{$arg};
        }
        elsif ($kind eq "HASH") {
            %args = %{$arg};
        }
        else {
            croak (q{usage: $csv->header ($fh, [ seps ], { options })});
        }
    }

    # munge as alias
    if (defined $args{munge} && !defined $args{munge_column_names}) {
        $args{munge_column_names} = $args{munge};
    }
    foreach my $default (
        [ detect_bom         => 1    ],
        [ set_column_names   => 1    ],
        [ munge_column_names => "lc" ],
    ) {
        my ($opt, $value) = @{$default};
        defined $args{$opt} or $args{$opt} = $value;
    }

    # Reset any previous leftovers
    @{$self}{qw( _RECNO _AHEAD )} = (0, undef);
    if ($args{set_column_names}) {
        $self->{_COLUMN_NAMES}  = undef;
        $self->{_BOUND_COLUMNS} = undef;
    }
    $self->_cache_set($_cache_id{'_has_ahead'}, 0);

    if (defined $args{sep_set}) {
        ref $args{sep_set} eq "ARRAY" or
            croak ($self->_SetDiagInfo (1500, "sep_set should be an array ref"));
        @seps = @{$args{sep_set}};
    }

    $^O eq "MSWin32" and binmode $fh;
    my $hdr = <$fh>;
    # check if $hdr can be empty here, I don't think so
    defined $hdr && $hdr ne "" or croak ($self->SetDiag (1010));

    # Which of the candidate separators occur in the header line?
    @seps or @seps = (",", ";");
    my %found = map { $_ => 1 } grep { index ($hdr, $_) >= 0 } @seps;
    scalar (keys %found) >= 2 and croak ($self->SetDiag (1011));
    $self->sep (keys %found);

    my $enc = "";
    if ($args{detect_bom}) { # UTF-7 is not supported
        # Tried in this order; utf-32le has to come before utf-16le,
        # whose mark is a prefix of it.
        my @bom = (
            [ qr/^\x00\x00\xfe\xff/ => "utf-32be"   ],
            [ qr/^\xff\xfe\x00\x00/ => "utf-32le"   ],
            [ qr/^\xfe\xff/         => "utf-16be"   ],
            [ qr/^\xff\xfe/         => "utf-16le"   ],
            [ qr/^\xef\xbb\xbf/     => "utf-8"      ],
            [ qr/^\xf7\x64\x4c/     => "utf-1"      ],
            [ qr/^\xdd\x73\x66\x73/ => "utf-ebcdic" ],
            [ qr/^\x0e\xfe\xff/     => "scsu"       ],
            [ qr/^\xfb\xee\x28/     => "bocu-1"     ],
            [ qr/^\x84\x31\x95\x33/ => "gb-18030"   ],
            [ qr/^\x{feff}/         => ""           ],
        );
        foreach my $try (@bom) {
            my ($mark, $name) = @{$try};
            $hdr =~ s/$mark// or next;
            $enc = $name;
            last;
        }

        $self->{ENCODING} = uc $enc;

        $hdr eq "" and croak ($self->SetDiag (1010));

        if ($enc) {
            if ($enc =~ m/([13]).le$/) {
                # utf-16le / utf-32le: pad the header with 1 or 3 NUL
                # bytes and consume as many bytes from the handle
                my $l = 0 + $1;
                my $x;
                $hdr .= "\0" x $l;
                read $fh, $x, $l;
            }
            if ($enc ne "utf-8") {
                require Encode;
                $hdr = Encode::decode ($enc, $hdr);
            }
            binmode $fh, ":encoding($enc)";
        }
    }

    # If more than the header line was read, keep the remainder as
    # look-ahead data and remember the line ending that separated them.
    my ($ahead, $eol);
    if ($hdr =~ s/^([^\r\n]+)([\r\n]+)([^\r\n].+)\z/$1/s) {
        ($eol, $ahead) = ($2, $3);
    }

    my $munge = $args{munge_column_names};
    $munge eq "lc" and $hdr = lc $hdr;
    $munge eq "uc" and $hdr = uc $hdr;

    my $hr = \$hdr; # Will cause croak on perl-5.6.x
    open my $h, "<", $hr or croak ($self->SetDiag (1010));

    my $row = $self->getline ($h) or croak;
    close $h;

    if ($ahead) { # Must be after getline, which creates the cache
        $self->_cache_set ($_cache_id{_has_ahead}, 1);
        $self->{_AHEAD} = $ahead;
        $eol =~ m/^\r([^\n]|\z)/ and $self->eol ($eol);
    }

    my @hdr = @{$row};
    if (ref $munge eq "CODE") {
        @hdr = map { $munge->($_) } @hdr;
    }
    if (ref $munge eq "HASH") {
        @hdr = map { $munge->{$_} || $_ } @hdr;
    }

    my %seen;
    $seen{$_}++ for @hdr;
    exists $seen{""} and croak ($self->SetDiag (1012));
    unless (keys %seen == @hdr) {
        croak ($self->_SetDiagInfo (1013, join ", " =>
            map { "$_ ($seen{$_})" } grep { $seen{$_} > 1 } keys %seen));
    }

    $args{set_column_names} and $self->column_names (@hdr);
    return wantarray ? @hdr : $self;
}
| 1051 |
|
---|
# Get, reset or set the scalar references that rows are bound to.
#   no arguments   - return the current bindings (undef if none)
#   a single undef - drop the bindings, return undef
#   a list         - bind to these references
# Croaks with 3003 when the count differs from the number of column
# names, and with 3004 when an argument is not a SCALAR reference.
# Returns the bound references.
sub bind_columns {
    my ($self, @refs) = @_;

    unless (@refs) {
        my $bound = $self->{_BOUND_COLUMNS};
        return defined $bound ? @{$bound} : undef;
    }

    if (@refs == 1 && !defined $refs[0]) {
        return $self->{_BOUND_COLUMNS} = undef;
    }

    my $names = $self->{_COLUMN_NAMES};
    $names && @refs != @{$names} and croak $self->SetDiag( 3003 );

    foreach my $target (@refs) {
        ref $target eq "SCALAR" or croak $self->SetDiag( 3004 );
    }

    $self->_set_attr_N("_is_bound", scalar @refs);
    $self->{_BOUND_COLUMNS} = [ @refs ];
    return @refs;
}
| 1070 |
|
---|
# Read one record with getline () and return it as a hash reference keyed
# on the column names.  Croaks with 3002 when no column names are set and
# returns nothing when getline () yields no record.  When field flags are
# kept, columns beyond the fields actually read are flagged IS_MISSING,
# as is the single field of a record that is just one empty field.
sub getline_hr {
    my ($self, @args) = @_;
    my %hr;

    $self->{_COLUMN_NAMES} or croak ($self->SetDiag (3002));
    my $fr = $self->getline (@args) or return;

    my $names = $self->{_COLUMN_NAMES};
    my $flags = $self->{_FFLAGS};
    if (ref $flags) { # missing
        my $got = @{$fr};
        foreach my $i ($got .. $#{$names}) {
            $flags->[$i] = IS_MISSING;
        }
        if ($got == 1 && (!defined $fr->[0] || $fr->[0] eq "")) {
            $flags->[0] ||= IS_MISSING;
        }
    }

    @hr{@{$names}} = @{$fr};
    return \%hr;
}
| 1084 |
|
---|
# Fetch records with getline_all () and return a reference to a list of
# hash references, each keyed on the column names.  Croaks with 3002 when
# no column names are set.
sub getline_hr_all {
    my ($self, $io, @args) = @_;

    $self->{_COLUMN_NAMES} or croak $self->SetDiag( 3002 );

    my @keys = @{$self->{_COLUMN_NAMES}};

    my @records = map {
        my %rec;
        @rec{@keys} = @{$_};
        \%rec;
    } @{ $self->getline_all( $io, @args ) };

    return \@records;
}
| 1096 |
|
---|
# Like print (), but when no eol is configured the record is terminated
# with $\ (or $/ if $\ is not set).  The configured eol is put back after
# printing.  Returns whatever print () returned.
sub say {
    my ($self, $io, @f) = @_;

    my $eol = $self->eol;
    if ($eol eq "") {
        $self->eol ($\ || $/);
    }

    # say ($fh, undef) does not propagate actual undef to print ()
    my $state;
    if (@f == 1 && !defined $f[0]) {
        $state = $self->print ($io, undef);
    }
    else {
        $state = $self->print ($io, @f);
    }

    $self->eol ($eol);
    return $state;
}
| 1106 |
|
---|
# Print one record given as a hash reference: the values are taken in
# column_names () order and handed to print ().  Croaks with 3009 when no
# column names are set and with 3010 when $hr is not a hash reference.
sub print_hr {
    my ($self, $io, $hr) = @_;

    croak($self->SetDiag(3009)) unless $self->{_COLUMN_NAMES};
    croak($self->SetDiag(3010)) unless ref $hr eq "HASH";

    my @row = @{$hr}{ $self->column_names };
    return $self->print ($io, \@row);
}
| 1113 |
|
---|
# Read records from $io and return only the part selected by $spec, as a
# reference to a list of rows.
#
# $spec is "row=...", "col=..." or "cell=..." (optionally preceded by
# "#"), where the selection is a ";"-separated list of
#   row / col : N or N-M or N-*
#   cell      : R,C or R,C-R2,C2, where R2 and/or C2 may be "*"
# All indices are 1-based and "*" means "up to the end".
#
# When column names are set, every returned row is a hash reference keyed
# on those names instead of an array reference.
# Croaks with diagnostic 2013 on an invalid specification.
sub fragment {
    my ($self, $io, $spec) = @_;

    my $qd = qr{\s* [0-9]+ \s* }x;             # digit
    my $qs = qr{\s* (?: [0-9]+ | \* ) \s*}x;   # digit or star
    my $qr = qr{$qd (?: - $qs )?}x;            # range
    my $qc = qr{$qr (?: ; $qr )*}x;            # list
    defined $spec && $spec =~ m{^ \s*
        \x23 ? \s*                                # optional leading #
        ( row | col | cell ) \s* =
        ( $qc                                     # for row and col
        | $qd , $qd (?: - $qs , $qs)?             # for cell (ranges)
          (?: ; $qd , $qd (?: - $qs , $qs)? )*    # and cell (range) lists
        ) \s* $}xi or croak ($self->SetDiag (2013));
    my ($type, $range) = (lc $1, $2);

    my @h = $self->column_names ();

    my @c;
    if ($type eq "cell") {
        my @spec;
        my $min_row;
        my $max_row = 0;
        for (split m/\s*;\s*/ => $range) {
            my ($tlr, $tlc, $brr, $brc) = (m{
                    ^ \s* ([0-9]+     ) \s* , \s* ([0-9]+     ) \s*
                (?: - \s* ([0-9]+ | \*) \s* , \s* ([0-9]+ | \*) \s* )?
                    $}x) or croak ($self->SetDiag (2013));
            # A single cell is a range that ends where it starts
            defined $brr or ($brr, $brc) = ($tlr, $tlc);
            $tlr == 0 || $tlc == 0 ||
                ($brr ne "*" && ($brr == 0 || $brr < $tlr)) ||
                ($brc ne "*" && ($brc == 0 || $brc < $tlc))
                and croak ($self->SetDiag (2013));
            # Columns become 0-based indices, rows stay 1-based counters
            $tlc--;
            $brc-- unless $brc eq "*";
            defined $min_row or $min_row = $tlr;
            $tlr < $min_row and $min_row = $tlr;
            # Once any range is open-ended ("*") all input has to be read:
            # a later bounded range must not lower the limit again.
            $max_row ne "*" && ($brr eq "*" || $brr > $max_row) and
                $max_row = $brr;
            push @spec, [ $tlr, $tlc, $brr, $brc ];
        }
        my $r = 0;
        while (my $row = $self->getline ($io)) {
            ++$r < $min_row and next;
            # Collect the selected columns of this row, keyed on index so
            # overlapping ranges contribute each column only once
            my %row;
            foreach my $s (@spec) {
                my ($tlr, $tlc, $brr, $brc) = @$s;
                $r < $tlr || ($brr ne "*" && $r > $brr) and next;
                my $rr = $brc eq "*" ? $#$row : $brc;
                $row{$_} = $row->[$_] for $tlc .. $rr;
            }
            push @c, [ @row{sort { $a <=> $b } keys %row } ];
            if (@h) {
                my %h; @h{@h} = @{$c[-1]};
                $c[-1] = \%h;
            }
            $max_row ne "*" && $r == $max_row and last;
        }
        return \@c;
    }

    # row or col
    my @r;          # $r[N] is true when row/column N is selected
    my $eod = 0;    # true when a range runs to the end ("*")
    for (split m/\s*;\s*/ => $range) {
        my ($from, $to) = m/^\s* ([0-9]+) (?: \s* - \s* ([0-9]+ | \* ))? \s* $/x
            or croak ($self->SetDiag (2013));
        $to ||= $from;
        $to eq "*" and ($to, $eod) = ($from, 1);
        # $to cannot be <= 0 due to regex and ||=
        $from <= 0 || $to < $from and croak ($self->SetDiag (2013));
        $r[$_] = 1 for $from .. $to;
    }

    my $r = 0;
    # Rows are counted from 1, column indices from 0
    $type eq "col" and shift @r;
    $_ ||= 0 for @r;
    while (my $row = $self->getline ($io)) {
        $r++;
        if ($type eq "row") {
            if (($r > $#r && $eod) || $r[$r]) {
                push @c, $row;
                if (@h) {
                    my %h; @h{@h} = @{$c[-1]};
                    $c[-1] = \%h;
                }
            }
            next;
        }
        push @c, [ map { ($_ > $#r && $eod) || $r[$_] ? $row->[$_] : () } 0..$#$row ];
        if (@h) {
            my %h; @h{@h} = @{$c[-1]};
            $c[-1] = \%h;
        }
    }

    return \@c;
}
| 1214 |
|
---|
my $csv_usage = q{usage: my $aoa = csv (in => $file);};

# Digest the arguments of csv () (a hash reference or a key/value list).
#
# Opens the input and/or output as needed, strips every option that is
# handled by csv () itself from the attribute set, and creates the parser
# object from what remains (unless one was passed as "csv").  Returns a
# hash reference holding the parser, the handle, the remaining attributes
# and all options that were taken out.
sub _csv_attr {
    my %attr = (@_ == 1 && ref $_[0] eq "HASH" ? %{$_[0]} : @_) or croak;

    $attr{binary} = 1;

    my $enc = delete $attr{enc} || delete $attr{encoding} || "";
    if ($enc eq "auto") {
        $attr{detect_bom} = 1;
        $enc              = "";
    }
    # A bare encoding name becomes a PerlIO layer
    $enc = ":encoding($enc)" if $enc =~ m/^[-\w.]+$/;

    my $fh;
    my $sink = 0;
    my $cls  = 0;    # If I open a file, I have to close it
    my $in   = delete $attr{in}  || delete $attr{file} or croak $csv_usage;
    # An existing but false "out" means: produce no output at all
    my $out  = exists $attr{out} && !$attr{out} ? \"skip"
             : delete $attr{out} || delete $attr{file};

    my $in_type = ref $in;
    $in_type eq "CODE" || $in_type eq "ARRAY" and $out ||= \*STDOUT;

    $in && $out && !$in_type && !ref $out and croak join "\n" =>
        qq{Cannot use a string for both in and out. Instead use:},
        qq{ csv (in => csv (in => "$in"), out => "$out");\n};

    if ($out) {
        my $out_type = ref $out;
        if (($out_type and "SCALAR" ne $out_type) or "GLOB" eq ref \$out) {
            $fh = $out;
        }
        elsif ($out_type and "SCALAR" eq $out_type and defined $$out and $$out eq "skip") {
            delete $attr{out};
            $sink = 1;
        }
        else {
            open $fh, ">", $out or croak "$out: $!";
            $cls = 1;
        }
        if ($fh) {
            $enc and binmode $fh, $enc;
            unless (defined $attr{eol}) {
                # A handle with a crlf layer already turns "\n" into CRLF
                my @layers = eval { PerlIO::get_layers ($fh) };
                $attr{eol} = (grep m/crlf/ => @layers) ? "\n" : "\r\n";
            }
        }
    }

    if ($in_type eq "CODE" or $in_type eq "ARRAY") {
        # All done
    }
    elsif ($in_type eq "SCALAR") {
        # Strings with code points over 0xFF may not be mapped into in-memory file handles
        # "<$enc" does not change that :(
        open $fh, "<", $in or croak "Cannot open from SCALAR using PerlIO";
        $cls = 1;
    }
    elsif ($in_type or "GLOB" eq ref \$in) {
        if (!$in_type && $] < 5.008005) {
            $fh = \*$in; # uncoverable statement ancient perl version required
        }
        else {
            $fh = $in;
        }
    }
    else {
        open $fh, "<$enc", $in or croak "$in: $!";
        $cls = 1;
    }
    $fh || $sink or croak qq{No valid source passed. "in" is required};

    # Options handled by csv () itself; an alias is only removed from
    # %attr when the names before it did not yield a true value.
    my $hdrs = delete $attr{headers};
    my $frag = delete $attr{fragment};
    my $key  = delete $attr{key};
    my $val  = delete $attr{value};
    my $kh   = delete $attr{keep_headers}      ||
               delete $attr{keep_column_names} ||
               delete $attr{kh};

    my $cbai = delete $attr{callbacks}{after_in}    ||
               delete $attr{after_in}               ||
               delete $attr{callbacks}{after_parse} ||
               delete $attr{after_parse};
    my $cbbo = delete $attr{callbacks}{before_out}  ||
               delete $attr{before_out};
    my $cboi = delete $attr{callbacks}{on_in}       ||
               delete $attr{on_in};

    my $hd_s = delete $attr{sep_set}            ||
               delete $attr{seps};
    my $hd_b = delete $attr{detect_bom}         ||
               delete $attr{bom};
    my $hd_m = delete $attr{munge}              ||
               delete $attr{munge_column_names};
    my $hd_c = delete $attr{set_column_names};

    # Short attribute names; applied in this order, so that "esc" can end
    # up as "escape_char" via "escape"
    foreach my $alias (
        [ quo    => "quote"       ],
        [ esc    => "escape"      ],
        [ escape => "escape_char" ],
    ) {
        my ($f, $t) = @{$alias};
        exists $attr{$f} and !exists $attr{$t} and $attr{$t} = delete $attr{$f};
    }

    # A filter is a named predefined filter, a code ref (applied as
    # filter 0) or a hash of filters; anything else is dropped
    my $fltr = delete $attr{filter};
    my %fltr = (
        not_blank => sub { @{$_[1]} > 1 or defined $_[1][0] && $_[1][0] ne "" },
        not_empty => sub { grep { defined && $_ ne "" } @{$_[1]} },
        filled    => sub { grep { defined && m/\S/    } @{$_[1]} },
    );
    defined $fltr && !ref $fltr && exists $fltr{$fltr} and
        $fltr = { 0 => $fltr{$fltr} };
    ref $fltr eq "CODE" and $fltr = { 0 => $fltr };
    ref $fltr eq "HASH" or  $fltr = undef;

    exists $attr{formula} and
        $attr{formula} = _supported_formula (undef, $attr{formula});

    defined $attr{auto_diag}   or $attr{auto_diag}   = 1;
    defined $attr{escape_null} or $attr{escape_null} = 0;
    my $csv = delete $attr{csv} || Text::CSV_PP->new (\%attr)
        or croak $last_new_error;

    my %digest = (
        csv  => $csv,
        attr => { %attr },
        fh   => $fh,
        cls  => $cls,
        in   => $in,
        sink => $sink,
        out  => $out,
        enc  => $enc,
        hdrs => $hdrs,
        key  => $key,
        val  => $val,
        kh   => $kh,
        frag => $frag,
        fltr => $fltr,
        cbai => $cbai,
        cbbo => $cbbo,
        cboi => $cboi,
        hd_s => $hd_s,
        hd_b => $hd_b,
        hd_m => $hd_m,
        hd_c => $hd_c,
    );
    return \%digest;
}
| 1359 |
|
---|
# High-level interface: read a complete CSV source into a data structure,
# or write a data structure (or the rows produced by a code ref) as CSV.
#
# May be called as a function or as a method; the arguments are digested
# by _csv_attr ().
#
# Writing (an output is set and it is not the "skip" sink): prints all
# rows and returns 1.
# Reading: returns a reference to a list of array refs, or - when headers
# are in use - to a list of hash refs, or with "key" a hash ref keyed on
# that column.  Returns nothing when "out" was explicitly false.  In void
# context the data that was read is written out again through a nested
# csv () call.
sub csv {
    @_ && (ref $_[0] eq __PACKAGE__ or ref $_[0] eq 'Text::CSV') and splice @_, 0, 0, "csv";
    @_ or croak $csv_usage;

    my $c = _csv_attr (@_);

    my ($csv, $in, $fh, $hdrs) = @{$c}{"csv", "in", "fh", "hdrs"};
    my %hdr;
    if (ref $hdrs eq "HASH") {
        # A hash maps the names found in the data to the names to use
        %hdr  = %$hdrs;
        $hdrs = "auto";
    }

    if ($c->{out} && !$c->{sink}) {
        if (ref $in eq "CODE") {
            my $hdr = 1;
            while (my $row = $in->($csv)) {
                if (ref $row eq "ARRAY") {
                    $csv->print ($fh, $row);
                    next;
                }
                if (ref $row eq "HASH") {
                    if ($hdr) {
                        # The first hash decides the header line
                        $hdrs ||= [ map { $hdr{$_} || $_ } keys %$row ];
                        $csv->print ($fh, $hdrs);
                        $hdr = 0;
                    }
                    $csv->print ($fh, [ @{$row}{@$hdrs} ]);
                }
            }
        }
        elsif (ref $in->[0] eq "ARRAY") { # aoa
            ref $hdrs and $csv->print ($fh, $hdrs);
            for (@{$in}) {
                $c->{cboi} and $c->{cboi}->($csv, $_);
                $c->{cbbo} and $c->{cbbo}->($csv, $_);
                $csv->print ($fh, $_);
            }
        }
        else { # aoh
            my @hdrs = ref $hdrs ? @{$hdrs} : keys %{$in->[0]};
            defined $hdrs or $hdrs = "auto";
            ref $hdrs || $hdrs eq "auto" and
                $csv->print ($fh, [ map { $hdr{$_} || $_ } @hdrs ]);
            for (@{$in}) {
                # Make the record available to the callbacks as %_
                local %_;
                *_ = $_;
                $c->{cboi} and $c->{cboi}->($csv, $_);
                $c->{cbbo} and $c->{cbbo}->($csv, $_);
                $csv->print ($fh, [ @{$_}{@hdrs} ]);
            }
        }

        $c->{cls} and close $fh;
        return 1;
    }

    my @row1;
    if (defined $c->{hd_s} || defined $c->{hd_b} || defined $c->{hd_m} || defined $c->{hd_c}) {
        # Any header-related option: let header () read the first line.
        # The option names must match what header () looks for.
        my %harg;
        defined $c->{hd_s} and $harg{sep_set}            = $c->{hd_s};
        defined $c->{hd_b} and $harg{detect_bom}         = $c->{hd_b};
        defined $c->{hd_m} and $harg{munge_column_names} = $hdrs ? "none" : $c->{hd_m};
        defined $c->{hd_c} and $harg{set_column_names}   = $hdrs ? 0      : $c->{hd_c};
        @row1 = $csv->header ($fh, \%harg);
        my @hdr = $csv->column_names;
        @hdr and $hdrs ||= \@hdr;
    }

    if ($c->{kh}) {
        ref $c->{kh} eq "ARRAY" or croak ($csv->SetDiag (1501));
        $hdrs ||= "auto";
    }

    my $key = $c->{key};
    if ($key) {
        !ref $key or ref $key eq "ARRAY" && @$key > 1 or croak ($csv->SetDiag (1501));
        $hdrs ||= "auto";
    }
    my $val = $c->{val};
    if ($val) {
        $key                                          or croak ($csv->SetDiag (1502));
        !ref $val or ref $val eq "ARRAY" && @$val > 0 or croak ($csv->SetDiag (1503));
    }

    # Filters keyed on a column name need the header line
    $c->{fltr} && grep m/\D/ => keys %{$c->{fltr}} and $hdrs ||= "auto";
    if (defined $hdrs) {
        if (!ref $hdrs) {
            if ($hdrs eq "skip") {
                $csv->getline ($fh); # discard;
            }
            elsif ($hdrs eq "auto") {
                my $h = $csv->getline ($fh) or return;
                $hdrs = [ map {      $hdr{$_} || $_ } @$h ];
            }
            elsif ($hdrs eq "lc") {
                my $h = $csv->getline ($fh) or return;
                $hdrs = [ map { lc ($hdr{$_} || $_) } @$h ];
            }
            elsif ($hdrs eq "uc") {
                my $h = $csv->getline ($fh) or return;
                $hdrs = [ map { uc ($hdr{$_} || $_) } @$h ];
            }
        }
        elsif (ref $hdrs eq "CODE") {
            my $h  = $csv->getline ($fh) or return;
            my $cr = $hdrs;
            $hdrs  = [ map {  $cr->($hdr{$_} || $_) } @$h ];
        }
        # Hand the headers back through the caller's array
        $c->{kh} and $hdrs and @{$c->{kh}} = @$hdrs;
    }

    if ($c->{fltr}) {
        my %f = %{$c->{fltr}};
        # convert headers to index
        my @hdr;
        if (ref $hdrs) {
            @hdr = @{$hdrs};
            for (0 .. $#hdr) {
                exists $f{$hdr[$_]} and $f{$_ + 1} = delete $f{$hdr[$_]};
            }
        }
        $csv->callbacks (after_parse => sub {
            my ($CSV, $ROW) = @_; # lexical sub-variables in caps
            foreach my $FLD (sort keys %f) {
                # The filter sees the field as $_ and the record as %_
                local $_ = $ROW->[$FLD - 1];
                local %_;
                @hdr and @_{@hdr} = @$ROW;
                $f{$FLD}->($CSV, $ROW) or return \"skip";
                $ROW->[$FLD - 1] = $_;
            }
        });
    }

    my $frag = $c->{frag};
    my $ref = ref $hdrs
        ? # aoh
          do {
            my @h = $csv->column_names ($hdrs);
            my %h; $h{$_}++ for @h;
            exists $h{""} and croak ($csv->SetDiag (1012));
            unless (keys %h == @h) {
                croak ($csv->_SetDiagInfo (1013, join ", " =>
                    map { "$_ ($h{$_})" } grep { $h{$_} > 1 } keys %h));
            }
            $frag ? $csv->fragment ($fh, $frag) :
            $key  ? do {
                        # A plain key names one column; an array ref is
                        # ( join-string, column, column, ... )
                        my ($k, $j, @f) = ref $key ? (undef, @$key) : ($key);
                        if (my @mk = grep { !exists $h{$_} } grep { defined } $k, @f) {
                            croak ($csv->_SetDiagInfo (4001, join ", " => @mk));
                        }
                        +{ map {
                            my $r = $_;
                            my $K = defined $k ? $r->{$k} : join $j => @{$r}{@f};
                            ( $K => (
                            $val
                                ? ref $val
                                    ? { map { $_ => $r->{$_} } @$val }
                                    : $r->{$val}
                                : $r ));
                            } @{$csv->getline_hr_all ($fh)} }
                        }
                  : $csv->getline_hr_all ($fh);
            }
        : # aoa
            $frag ? $csv->fragment ($fh, $frag)
                  : $csv->getline_all ($fh);
    if ($ref) {
        @row1 && !$c->{hd_c} && !ref $hdrs and unshift @$ref, \@row1;
    }
    else {
        Text::CSV_PP->auto_diag;
    }
    $c->{cls} and close $fh;
    if ($ref and $c->{cbai} || $c->{cboi}) {
        # Default is ARRAYref, but with key =>, you'll get a hashref
        foreach my $r (ref $ref eq "ARRAY" ? @{$ref} : values %{$ref}) {
            local %_;
            ref $r eq "HASH" and *_ = $r;
            $c->{cbai} and $c->{cbai}->($csv, $r);
            $c->{cboi} and $c->{cboi}->($csv, $r);
        }
    }

    $c->{sink} and return;

    # Void context: write what was read
    defined wantarray or
        return csv (%{$c->{attr}}, in => $ref, headers => $hdrs, %{$c->{attr}});

    return $ref;
}
| 1551 |
|
---|
| 1552 | # The end of the common pure perl part.
|
---|
| 1553 |
|
---|
| 1554 | ################################################################################
|
---|
| 1555 | #
|
---|
| 1556 | # The following are methods implemented in XS in Text::CSV_XS or
|
---|
| 1557 | # helper methods for Text::CSV_PP only
|
---|
| 1558 | #
|
---|
| 1559 | ################################################################################
|
---|
| 1560 |
|
---|
# Build the working context hash used for one combine/parse call.
#
# Resets the file-level $last_error.  When $self->{_CACHE} is already
# populated the context starts as a shallow copy of it; otherwise the
# context is derived from the object's attributes (separator, quote,
# escape, eol, undef_str, types, bound-column count, hooks and the boolean
# flags) and a copy is stored back into $self->{_CACHE}.
#
# The per-call fields (utf8, size, used, bound, eol_pos, eolx) are always
# recomputed, whether or not the cache was used.
#
# Returns the context hash reference.
sub _setup_ctx {
    my $self = shift;

    $last_error = undef;

    my $ctx;
    if ($self->{_CACHE}) {
        %$ctx = %{$self->{_CACHE}};
    } else {
        # separator: sep_char first, then a non-empty (possibly
        # multi-byte) sep overrides it
        $ctx->{sep} = ',';
        if (defined $self->{sep_char}) {
            $ctx->{sep} = $self->{sep_char};
        }
        if (defined $self->{sep} and $self->{sep} ne '') {
            use bytes;
            $ctx->{sep} = $self->{sep};
            my $sep_len = length($ctx->{sep});
            # sep_len is only recorded for multi-byte separators
            $ctx->{sep_len} = $sep_len if $sep_len > 1;
        }

        # quote: an existing but undef/empty quote_char is stored as NUL
        $ctx->{quo} = '"';
        if (exists $self->{quote_char}) {
            my $quote_char = $self->{quote_char};
            if (defined $quote_char and length $quote_char) {
                $ctx->{quo} = $quote_char;
            } else {
                $ctx->{quo} = "\0";
            }
        }
        if (defined $self->{quote} and $self->{quote} ne '') {
            use bytes;
            $ctx->{quo} = $self->{quote};
            my $quote_len = length($ctx->{quo});
            $ctx->{quo_len} = $quote_len if $quote_len > 1;
        }

        # escape: same NUL convention as the quote character
        $ctx->{escape_char} = '"';
        if (exists $self->{escape_char}) {
            my $escape_char = $self->{escape_char};
            if (defined $escape_char and length $escape_char) {
                $ctx->{escape_char} = $escape_char;
            } else {
                $ctx->{escape_char} = "\0";
            }
        }

        if (defined $self->{eol}) {
            my $eol = $self->{eol};
            my $eol_len = length($eol);
            $ctx->{eol} = $eol;
            $ctx->{eol_len} = $eol_len;
            if ($eol_len == 1 and $eol eq "\015") {
                $ctx->{eol_is_cr} = 1;
            }
        }

        $ctx->{undef_flg} = 0;
        if (defined $self->{undef_str}) {
            $ctx->{undef_str} = $self->{undef_str};
            $ctx->{undef_flg} = 3 if utf8::is_utf8($self->{undef_str});
        } else {
            $ctx->{undef_str} = undef;
        }

        if (defined $self->{_types}) {
            $ctx->{types} = $self->{_types};
            $ctx->{types_len} = length($ctx->{types});
        }

        if (defined $self->{_is_bound}) {
            $ctx->{is_bound} = $self->{_is_bound};
        }

        if (defined $self->{callbacks}) {
            my $cb = $self->{callbacks};
            $ctx->{has_hooks} = 0;
            if (defined $cb->{after_parse} and ref $cb->{after_parse} eq 'CODE') {
                $ctx->{has_hooks} |= HOOK_AFTER_PARSE;
            }
            if (defined $cb->{before_print} and ref $cb->{before_print} eq 'CODE') {
                $ctx->{has_hooks} |= HOOK_BEFORE_PRINT;
            }
        }

        # flags that default to 0 when the attribute is not defined
        for (qw/
            binary decode_utf8 always_quote strict quote_empty
            allow_loose_quotes allow_loose_escapes
            allow_unquoted_escape allow_whitespace blank_is_undef
            empty_is_undef verbatim auto_diag diag_verbose
            keep_meta_info formula
        /) {
            $ctx->{$_} = defined $self->{$_} ? $self->{$_} : 0;
        }
        # flags that default to 1 when the attribute is not defined
        for (qw/quote_space escape_null quote_binary/) {
            $ctx->{$_} = defined $self->{$_} ? $self->{$_} : 1;
        }
        if ($ctx->{escape_char} eq "\0") {
            $ctx->{escape_null} = 0;
        }

        # FIXME: readonly
        %{$self->{_CACHE}} = %$ctx;
    }

    $ctx->{utf8} = 0;
    $ctx->{size} = 0;
    $ctx->{used} = 0;

    if ($ctx->{is_bound}) {
        my $bound = $self->{_BOUND_COLUMNS};
        if ($bound and ref $bound eq 'ARRAY') {
            $ctx->{bound} = $bound;
        } else {
            $ctx->{is_bound} = 0;
        }
    }

    $ctx->{eol_pos} = -1;
    # eolx is set for an eol that is not a plain single CR or LF (or when
    # verbatim is on).  Note: no '|' inside the bracketed class -- there it
    # would be a literal pipe and an eol of "|" would be misclassified.
    $ctx->{eolx} = $ctx->{eol_len}
        ? $ctx->{verbatim} || $ctx->{eol_len} >= 2
            ? 1
            : $ctx->{eol} =~ /\A[\015\012]/ ? 0 : 1
        : 0;

    if ($ctx->{sep_len} and $ctx->{sep_len} > 1 and _is_valid_utf8($ctx->{sep})) {
        $ctx->{utf8} = 1;
    }
    if ($ctx->{quo_len} and $ctx->{quo_len} > 1 and _is_valid_utf8($ctx->{quo})) {
        $ctx->{utf8} = 1;
    }

    $ctx;
}
|
---|
| 1694 |
|
---|
# Propagate a changed attribute into the cached context ($self->{_CACHE}).
#
#   $idx   - numeric cache id, mapped to an attribute name through
#            %_reverse_cache_id
#   $value - the new attribute value
#
# Does nothing (returns empty) when no cache exists yet.  Unknown ids only
# produce a warning.  Returns 1 otherwise.
sub _cache_set {
    my ($self, $idx, $value) = @_;
    return unless exists $self->{_CACHE};
    my $cache = $self->{_CACHE};

    my $key = $_reverse_cache_id{$idx};
    if (!defined $key) {
        warn (sprintf "Unknown cache index %d ignored\n", $idx);
    } elsif ($key eq 'sep_char') {
        $cache->{sep} = $value;
        $cache->{sep_len} = 0;
    }
    elsif ($key eq 'quote_char') {
        $cache->{quo} = $value;
        $cache->{quo_len} = 0;
    }
    elsif ($key eq '_has_ahead') {
        $cache->{has_ahead} = $value;
    }
    elsif ($key eq '_has_hooks') {
        $cache->{has_hooks} = $value;
    }
    elsif ($key eq '_is_bound') {
        $cache->{is_bound} = $value;
    }
    elsif ($key eq 'sep') {
        use bytes;
        my $len = bytes::length($value);
        $cache->{sep} = $value if $len;
        # a length of 0 marks a single-byte separator
        $cache->{sep_len} = $len == 1 ? 0 : $len;
    }
    elsif ($key eq 'quote') {
        use bytes;
        my $len = bytes::length($value);
        $cache->{quo} = $value if $len;
        $cache->{quo_len} = $len == 1 ? 0 : $len;
    }
    elsif ($key eq 'eol') {
        if (defined $value) {
            $cache->{eol} = $value;
            # keep the cached length in step with the string; the context
            # setup derives eolx from eol_len
            $cache->{eol_len} = length $value;
        }
        $cache->{eol_is_cr} = (defined $value and $value eq "\015") ? 1 : 0;
    }
    elsif ($key eq 'undef_str') {
        if (defined $value) {
            $cache->{undef_str} = $value;
            # always reassign so a previous UTF-8 flag does not linger
            $cache->{undef_flg} = utf8::is_utf8($value) ? 3 : 0;
        } else {
            $cache->{undef_str} = undef;
            $cache->{undef_flg} = 0;
        }
    }
    else {
        $cache->{$key} = $value;
    }
    return 1;
}
|
---|
| 1750 |
|
---|
# Dump the cached context to STDERR (via warn) for debugging.
# Emits "CACHE: invalid" and returns when no cache exists.
sub _cache_diag {
    my $self = shift;

    if (!exists $self->{_CACHE}) {
        warn ("CACHE: invalid\n");
        return;
    }

    my $cache = $self->{_CACHE};
    warn ("CACHE:\n");

    # single-character attributes: [ label, cache slot ]
    for my $entry ([quote_char => 'quo'], [escape_char => 'escape_char'], [sep_char => 'sep']) {
        my ($label, $slot) = @$entry;
        $self->__cache_show_char($label => $cache->{$slot});
    }

    # byte-sized flags, shown under their own cache key
    my @flag_keys = qw/
        binary decode_utf8 allow_loose_escapes allow_loose_quotes allow_unquoted_escape
        allow_whitespace always_quote quote_empty quote_space
        escape_null quote_binary auto_diag diag_verbose formula strict
        has_error_input blank_is_undef empty_is_undef has_ahead
        keep_meta_info verbatim has_hooks eol_is_cr eol_len
    /;
    for my $flag (@flag_keys) {
        $self->__cache_show_byte($flag => $cache->{$flag});
    }

    $self->__cache_show_str(eol => $cache->{eol_len}, $cache->{eol});

    # multi-byte separator / quote: length always, string only when > 1
    for my $entry (['sep_len', 'sep', 'sep'], ['quo_len', 'quote', 'quo']) {
        my ($len_key, $label, $slot) = @$entry;
        my $len = $cache->{$len_key};
        $self->__cache_show_byte($len_key => $len);
        if ($len and $len > 1) {
            $self->__cache_show_str($label => $len, $cache->{$slot});
        }
    }
}
|
---|
| 1782 |
|
---|
# Warn one diagnostic line for a byte-sized cache entry: the key, ord() of
# the value in hex, and the value as a decimal.  An undefined value is
# shown as 0 in both columns.
sub __cache_show_byte {
    my ($self, $key, $value) = @_;
    my ($code, $number) = defined $value ? (ord($value), $value) : (0, 0);
    warn (sprintf " %-21s %02x:%3d\n", $key, $code, $number);
}
|
---|
| 1787 |
|
---|
# Warn one diagnostic line for a character cache entry: the key, the code
# of its first unpacked element in hex, and that character pretty-printed.
# An undefined value is shown as code 0 with an empty string.
sub __cache_show_char {
    my ($self, $key, $value) = @_;

    my $first = $value;
    if (defined $value) {
        # only the first element of the unpacked value is displayed
        my ($lead) = unpack "U0C*", $value;
        $first = pack "U*", $lead;
    }

    my $code = defined $first ? ord($first) : 0;
    warn (sprintf " %-21s %02x:%s\n", $key, $code, $self->__pretty_str($first, 1));
}
|
---|
| 1797 |
|
---|
# Warn one diagnostic line for a string cache entry: the key, the given
# length, and the first $len characters of the value pretty-printed.
sub __cache_show_str {
    my ($self, $key, $len, $value) = @_;
    my $shown = $self->__pretty_str($value, $len);
    warn (sprintf " %-21s %02d:%s\n", $key, $len, $shown);
}
|
---|
| 1802 |
|
---|
# Render the first $len characters of $str as a double-quoted, printable
# string: embedded double quotes are backslash-escaped and every character
# outside TAB / printable ASCII becomes \x{HEX}.
# Returns '' (without quotes) for an undefined $str.
sub __pretty_str { # FIXME
    my ($self, $str, $len) = @_;
    defined $str or return '';

    my $shown = substr($str, 0, $len);
    $shown =~ s/"/\\"/g;
    $shown =~ s/([^\x09\x20-\x7e])/sprintf '\\x{%x}', ord($1)/eg;

    return qq{"$shown"};
}
|
---|
| 1811 |
|
---|
# Invoke the callback registered under $name, passing ($self, $fields).
#
# Returns 0 when there are no callbacks, when the named entry is not a
# code reference, or when the callback's first return value is a reference
# to the string "skip".  Otherwise returns the number of values the
# callback returned.
sub _hook {
    my ($self, $name, $fields) = @_;

    my $callbacks = $self->{callbacks} or return 0;

    my $code = $callbacks->{$name};
    return 0 unless $code and ref $code eq 'CODE';

    my @returned = $code->($self, $fields);
    my $count = @returned;

    if ($count and ref $returned[0] eq 'SCALAR' and ${$returned[0]} eq "skip") {
        return 0;
    }
    return $count;
}
|
---|
| 1825 |
|
---|
| 1826 | ################################################################################
|
---|
| 1827 | # methods for combine
|
---|
| 1828 | ################################################################################
|
---|
| 1829 |
|
---|
# Join @$fields into one CSV record and store it in $$dst.
#
#   $dst    - reference to the scalar receiving the record
#   $fields - array reference of field values; when it is empty and columns
#             are bound, the bound columns are used instead
#   $useIO  - accepted for interface compatibility; not used here
#
# Undefined fields are emitted as undef_str when that is a non-empty
# string, else as an empty field.  Fields are quoted when always_quote is
# set, when meta info says they were quoted, or when they contain the
# quote, separator or escape string, binary bytes (quote_binary) or
# whitespace (quote_space).
#
# Returns 1 on success.  Returns 0 after setting error 2110 when a field
# holds a non-printable character and binary mode is off.
sub __combine {
    my ($self, $dst, $fields, $useIO) = @_;

    my $ctx = $self->_setup_ctx;

    my ($quot, $sep, $esc) = @{$ctx}{qw/quo sep escape_char/};

    # a missing or NUL quote character means "no quoting"
    if (!defined $quot or $quot eq "\0") { $quot = ''; }

    # compiled per (quote, escape) pair and memoised on the object
    my $re_esc;
    if ($esc ne '' and $esc ne "\0") {
        if ($quot ne '') {
            $re_esc = $self->{_re_comb_escape}->{$quot}->{$esc} ||= qr/(\Q$quot\E|\Q$esc\E)/;
        } else {
            $re_esc = $self->{_re_comb_escape}->{$quot}->{$esc} ||= qr/(\Q$esc\E)/;
        }
    }

    my $bound = 0;
    my $n = @$fields - 1;
    if ($n < 0 and $ctx->{is_bound}) {
        $n = $ctx->{is_bound} - 1;
        $bound = 1;
    }

    my $check_meta = ($ctx->{keep_meta_info} >= 10 and @{$self->{_FFLAGS} || []} >= $n) ? 1 : 0;

    my @results;
    for my $i (0 .. $n) {
        my $v_ref;
        if ($bound) {
            $v_ref = $self->__bound_field($ctx, $i, 1);
        } elsif (@$fields > $i) {
            $v_ref = \($fields->[$i]);
        }
        next unless $v_ref;

        my $value = $$v_ref;

        if (!defined $value) {
            # test for "defined and non-empty", not truthiness, so that an
            # undef_str of "0" is honoured
            if (defined $ctx->{undef_str} and length $ctx->{undef_str}) {
                if ($ctx->{undef_flg}) {
                    $ctx->{utf8} = 1;
                    $ctx->{binary} = 1;
                }
                push @results, $ctx->{undef_str};
            } else {
                push @results, '';
            }
            next;
        }

        if ( substr($value, 0, 1) eq '=' && $ctx->{formula} ) {
            $value = $self->_formula($ctx, $value, $i);
            if (!defined $value) {
                push @results, '';
                next;
            }
        }

        my $must_be_quoted = $ctx->{always_quote} ? 1 : 0;
        if ($value eq '') {
            $must_be_quoted++ if $ctx->{quote_empty} or ($check_meta && $self->is_quoted($i));
        }
        else {

            if (utf8::is_utf8 $value) {
                $ctx->{utf8} = 1;
                $ctx->{binary} = 1;
            }

            $must_be_quoted++ if $check_meta && $self->is_quoted($i);

            if (!$must_be_quoted and $quot ne '') {
                use bytes;
                $must_be_quoted++ if
                    ($value =~ /\Q$quot\E/) ||
                    ($sep ne '' and $sep ne "\0" and $value =~ /\Q$sep\E/) ||
                    ($esc ne '' and $esc ne "\0" and $value =~ /\Q$esc\E/) ||
                    ($ctx->{quote_binary} && $value =~ /[\x00-\x1f\x7f-\xa0]/) ||
                    ($ctx->{quote_space} && $value =~ /[\x09\x20]/);
            }

            if (!$ctx->{binary} and $value =~ /[^\x09\x20-\x7E]/) {
                # an argument contained an invalid character...
                $self->{_ERROR_INPUT} = $value;
                $self->SetDiag(2110);
                return 0;
            }

            if ($re_esc) {
                $value =~ s/($re_esc)/$esc$1/g;
            }
            if ($ctx->{escape_null}) {
                $value =~ s/\0/${esc}0/g;
            }
        }

        if ($must_be_quoted) {
            $value = $quot . $value . $quot;
        }
        push @results, $value;
    }

    $$dst = join($sep, @results) . ( defined $ctx->{eol} ? $ctx->{eol} : '' );

    return 1;
}
|
---|
| 1941 |
|
---|
# Apply the configured formula action to a field value starting with '='.
#
#   $ctx->{formula} action:
#     false - return nothing
#     1, 2  - die "Formulas are forbidden"
#     3     - warn about the formula and return $value unchanged
#     4     - return the empty string
#     5     - return undef
#     other - return nothing
#
# $i is the field number used in the warning text; the matching entry of
# $self->{_COLUMN_NAMES} (index $i - 1) is added to it when defined.
sub _formula {
    my ($self, $ctx, $value, $i) = @_;

    my $action = $ctx->{formula} or return;

    # XS croak behaves like PP's "die"
    die "Formulas are forbidden\n" if $action == 1 or $action == 2;

    return ''    if $action == 4;
    return undef if $action == 5;
    return       unless $action == 3;

    # action 3: diagnose, then hand the value back untouched
    my $rec = $ctx->{recno}
        ? sprintf(" in record %lu", $ctx->{recno} + 1)
        : '';

    my $field = '';
    my $names = $self->{_COLUMN_NAMES};
    if (ref $names eq 'ARRAY' and @$names >= $i - 1) {
        my $name = $names->[$i - 1];
        if (defined $name) {
            $field = sprintf " (column: '%.100s')", $name;
        }
    }

    warn sprintf("Field %d%s%s contains formula '%s'\n", $i, $field, $rec, $value);
    return $value;
}
|
---|
| 1972 |
|
---|
# Combine $fields into a CSV record and write it to the handle $io.
#
#   $io     - object with a print method (IO::Handle is loaded here)
#   $fields - array reference of field values; undef is treated as []
#
# Croaks when $fields is defined but not an array reference.  Runs the
# before_print hook first.  Returns '' when combining fails; otherwise the
# result of $io->print, or of _set_error_diag(2200) when the write fails.
sub print {
    my ($self, $io, $fields) = @_;

    require IO::Handle;

    if (defined $fields) {
        ref($fields) eq 'ARRAY'
            or Carp::croak("Expected fields to be an array ref");
    } else {
        $fields = [];
    }

    $self->_hook(before_print => $fields);

    my $record = "";
    return '' unless $self->__combine(\$record, $fields, 1);

    # the record already carries its own eol; keep $\ from adding another
    local $\ = '';

    $io->print( $record ) or $self->_set_error_diag(2200);
}
|
---|
| 1993 |
|
---|
| 1994 | ################################################################################
|
---|
| 1995 | # methods for parse
|
---|
| 1996 | ################################################################################
|
---|
| 1997 |
|
---|
| 1998 |
|
---|
# Parse one record: set up a fresh context, run ___parse, and fire the
# after_parse hook when the parse succeeded and such a hook is flagged in
# the context.
#
# Returns the parse state when it is true, otherwise the negation of the
# file-level $last_error.
sub __parse { # cx_xsParse
    my ($self, $fields, $fflags, $src, $useIO) = @_;

    my $ctx   = $self->_setup_ctx;
    my $state = $self->___parse($ctx, $fields, $fflags, $src, $useIO);

    my $hooks = $ctx->{has_hooks} || 0;
    $self->_hook(after_parse => $fields)
        if $state and $hooks & HOOK_AFTER_PARSE;

    return $state if $state;
    return !$last_error;
}
|
---|
| 2010 |
|
---|
# Drive one record parse around ____parse and do the bookkeeping.
#
#   $ctx    - context from _setup_ctx
#   $fields - array reference receiving the parsed fields
#   $fflags - array reference receiving per-field flags (may be false)
#   $src    - the string to parse, or the input source in IO mode
#   $useIO  - true for IO mode; stored in $ctx->{useIO}
#
# Steps: prime the buffer (from the saved look-ahead in IO mode, from $src
# otherwise), parse, bump the record number, enforce the strict field
# count (error 2014), save unread input as look-ahead, write the context
# back to $self->{_CACHE}, keep the field flags when keep_meta_info is on,
# and coerce fields according to the configured types.
#
# Returns the result of ____parse, or nothing on a strict violation.
sub ___parse { # cx_c_xsParse
    my ($self, $ctx, $fields, $fflags, $src, $useIO) = @_;

    # the input record separator follows the configured eol only for a
    # special eol or a bare CR
    local $/ = $ctx->{eol} if $ctx->{eolx} or $ctx->{eol_is_cr};

    if ($ctx->{useIO} = $useIO) {
        require IO::Handle;

        $ctx->{tmp} = undef;
        if ($ctx->{has_ahead} and defined $self->{_AHEAD}) {
            $ctx->{tmp} = $self->{_AHEAD};
            $ctx->{size} = length $ctx->{tmp};
            $ctx->{used} = 0;
        }
    } else {
        $ctx->{tmp} = $src;
        $ctx->{size} = length $src;
        $ctx->{used} = 0;
        $ctx->{utf8} = utf8::is_utf8($src);
    }
    if ($ctx->{has_error_input}) {
        $self->{_ERROR_INPUT} = undef;
        $ctx->{has_error_input} = 0;
    }

    my $result = $self->____parse($ctx, $src, $fields, $fflags);
    $self->{_RECNO} = ++($ctx->{recno});
    $self->{_EOF} = '';

    if ($ctx->{strict}) {
        # the first record fixes the expected number of fields
        $ctx->{strict_n} ||= $ctx->{fld_idx};
        if ($ctx->{strict_n} != $ctx->{fld_idx}) {
            $self->__parse_error($ctx, 2014, $ctx->{used});
            return;
        }
    }

    if ($ctx->{useIO}) {
        if (defined $ctx->{tmp} and $ctx->{used} < $ctx->{size} and $ctx->{has_ahead}) {
            # keep the unread tail for the next call
            $self->{_AHEAD} = substr($ctx->{tmp}, $ctx->{used}, $ctx->{size} - $ctx->{used});
        } else {
            $ctx->{has_ahead} = 0;
            if ($ctx->{useIO} & useIO_EOF) {
                $self->{_EOF} = 1;
            }
        }
    }

    # persist the context in both modes
    %{$self->{_CACHE}} = %$ctx;

    if ($ctx->{useIO} and $fflags and $ctx->{keep_meta_info}) {
        $self->{_FFLAGS} = $fflags;
    }

    if ($result and $ctx->{types}) {
        my $n_fields = @$fields;
        # strict upper bounds: stay inside both the field list and the
        # types string
        for (my $i = 0; $i < $n_fields && $i < $ctx->{types_len}; $i++) {
            my $value = $fields->[$i];
            next unless defined $value;
            my $type = ord(substr($ctx->{types}, $i, 1));
            if ($type == IV) {
                $fields->[$i] = int($value);
            } elsif ($type == NV) {
                $fields->[$i] = $value + 0.0;
            }
        }
    }

    $result;
}
|
---|
| 2086 |
|
---|
| 2087 | sub ____parse { # cx_Parse
|
---|
| 2088 | my ($self, $ctx, $src, $fields, $fflags) = @_;
|
---|
| 2089 |
|
---|
| 2090 | my ($quot, $sep, $esc, $eol) = @{$ctx}{qw/quo sep escape_char eol/};
|
---|
| 2091 |
|
---|
| 2092 | utf8::encode($sep) if !$ctx->{utf8} and $ctx->{sep_len};
|
---|
| 2093 | utf8::encode($quot) if !$ctx->{utf8} and $ctx->{quo_len};
|
---|
| 2094 | utf8::encode($eol) if !$ctx->{utf8} and $ctx->{eol_len};
|
---|
| 2095 |
|
---|
| 2096 | my $seenSomething = 0;
|
---|
| 2097 | my $waitingForField = 1;
|
---|
| 2098 | my ($value, $v_ref);
|
---|
| 2099 | $ctx->{fld_idx} = my $fnum = 0;
|
---|
| 2100 | $ctx->{flag} = 0;
|
---|
| 2101 |
|
---|
| 2102 | my $re_str = join '|', map({$_ eq "\0" ? '[\\0]' : quotemeta($_)} sort {length $b <=> length $a} grep {defined $_ and $_ ne ''} $sep, $quot, $esc, $eol), "\015", "\012", "\x09", " ";
|
---|
| 2103 | $ctx->{_re} = qr/$re_str/;
|
---|
| 2104 | my $re = qr/$re_str|[^\x09\x20-\x7E]|$/;
|
---|
| 2105 |
|
---|
| 2106 | LOOP:
|
---|
| 2107 | while($self->__get_from_src($ctx, $src)) {
|
---|
| 2108 | while($ctx->{tmp} =~ /\G(.*?)($re)/gs) {
|
---|
| 2109 | my ($hit, $c) = ($1, $2);
|
---|
| 2110 | $ctx->{used} = pos($ctx->{tmp});
|
---|
| 2111 | if (!$waitingForField and $c eq '' and $hit ne '' and $ctx->{useIO} and !($ctx->{useIO} & useIO_EOF)) {
|
---|
| 2112 | $self->{_AHEAD} = $hit;
|
---|
| 2113 | $ctx->{has_ahead} = 1;
|
---|
| 2114 | $ctx->{has_leftover} = 1;
|
---|
| 2115 | last;
|
---|
| 2116 | }
|
---|
| 2117 | last if $seenSomething and $hit eq '' and $c eq ''; # EOF
|
---|
| 2118 |
|
---|
| 2119 | # new field
|
---|
| 2120 | if (!$v_ref) {
|
---|
| 2121 | if ($ctx->{is_bound}) {
|
---|
| 2122 | $v_ref = $self->__bound_field($ctx, $fnum, 0);
|
---|
| 2123 | } else {
|
---|
| 2124 | $value = '';
|
---|
| 2125 | $v_ref = \$value;
|
---|
| 2126 | }
|
---|
| 2127 | $fnum++;
|
---|
| 2128 | return unless $v_ref;
|
---|
| 2129 | $ctx->{flag} = 0;
|
---|
| 2130 | $ctx->{fld_idx}++;
|
---|
| 2131 | }
|
---|
| 2132 |
|
---|
| 2133 | $seenSomething = 1;
|
---|
| 2134 |
|
---|
| 2135 | if (defined $hit and $hit ne '') {
|
---|
| 2136 | if ($waitingForField) {
|
---|
| 2137 | $waitingForField = 0;
|
---|
| 2138 | }
|
---|
| 2139 | if ($hit =~ /[^\x09\x20-\x7E]/) {
|
---|
| 2140 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2141 | }
|
---|
| 2142 | $$v_ref .= $hit;
|
---|
| 2143 | }
|
---|
| 2144 |
|
---|
| 2145 | RESTART:
|
---|
| 2146 | if (defined $c and defined $sep and $c eq $sep) {
|
---|
| 2147 | if ($waitingForField) {
|
---|
| 2148 | # ,1,"foo, 3",,bar,
|
---|
| 2149 | # ^ ^
|
---|
| 2150 | if ($ctx->{blank_is_undef} or $ctx->{empty_is_undef}) {
|
---|
| 2151 | $$v_ref = undef;
|
---|
| 2152 | } else {
|
---|
| 2153 | $$v_ref = "";
|
---|
| 2154 | }
|
---|
| 2155 | unless ($ctx->{is_bound}) {
|
---|
| 2156 | push @$fields, $$v_ref;
|
---|
| 2157 | }
|
---|
| 2158 | $v_ref = undef;
|
---|
| 2159 | if ($ctx->{keep_meta_info} and $fflags) {
|
---|
| 2160 | push @$fflags, $ctx->{flag};
|
---|
| 2161 | }
|
---|
| 2162 | } elsif ($ctx->{flag} & IS_QUOTED) {
|
---|
| 2163 | # ,1,"foo, 3",,bar,
|
---|
| 2164 | # ^
|
---|
| 2165 | $$v_ref .= $c;
|
---|
| 2166 | } else {
|
---|
| 2167 | # ,1,"foo, 3",,bar,
|
---|
| 2168 | # ^ ^ ^
|
---|
| 2169 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2170 | $v_ref = undef;
|
---|
| 2171 | $waitingForField = 1;
|
---|
| 2172 | }
|
---|
| 2173 | }
|
---|
| 2174 | elsif (defined $c and defined $quot and $quot ne "\0" and $c eq $quot) {
|
---|
| 2175 | if ($waitingForField) {
|
---|
| 2176 | # ,1,"foo, 3",,bar,\r\n
|
---|
| 2177 | # ^
|
---|
| 2178 | $ctx->{flag} |= IS_QUOTED;
|
---|
| 2179 | $waitingForField = 0;
|
---|
| 2180 | next;
|
---|
| 2181 | }
|
---|
| 2182 | if ($ctx->{flag} & IS_QUOTED) {
|
---|
| 2183 | # ,1,"foo, 3",,bar,\r\n
|
---|
| 2184 | # ^
|
---|
| 2185 | my $quoesc = 0;
|
---|
| 2186 | my $c2 = $self->__get($ctx);
|
---|
| 2187 |
|
---|
| 2188 | if ($ctx->{allow_whitespace}) {
|
---|
| 2189 | # , 1 , "foo, 3" , , bar , \r\n
|
---|
| 2190 | # ^
|
---|
| 2191 | while($self->__is_whitespace($ctx, $c2)) {
|
---|
| 2192 | if ($ctx->{allow_loose_quotes} and !(defined $esc and $c2 eq $esc)) {
|
---|
| 2193 | $$v_ref .= $c;
|
---|
| 2194 | $c = $c2;
|
---|
| 2195 | }
|
---|
| 2196 | $c2 = $self->__get($ctx);
|
---|
| 2197 | }
|
---|
| 2198 | }
|
---|
| 2199 |
|
---|
| 2200 | if (!defined $c2) { # EOF
|
---|
| 2201 | # ,1,"foo, 3"
|
---|
| 2202 | # ^
|
---|
| 2203 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2204 | return 1;
|
---|
| 2205 | }
|
---|
| 2206 |
|
---|
| 2207 | if (defined $c2 and defined $sep and $c2 eq $sep) {
|
---|
| 2208 | # ,1,"foo, 3",,bar,\r\n
|
---|
| 2209 | # ^
|
---|
| 2210 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2211 | $v_ref = undef;
|
---|
| 2212 | $waitingForField = 1;
|
---|
| 2213 | next;
|
---|
| 2214 | }
|
---|
| 2215 | if (defined $c2 and ($c2 eq "\012" or (defined $eol and $c2 eq $eol))) { # FIXME: EOLX
|
---|
| 2216 | # ,1,"foo, 3",,"bar"\n
|
---|
| 2217 | # ^
|
---|
| 2218 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2219 | return 1;
|
---|
| 2220 | }
|
---|
| 2221 |
|
---|
| 2222 | if (defined $esc and $c eq $esc) {
|
---|
| 2223 | $quoesc = 1;
|
---|
| 2224 | if (defined $c2 and $c2 eq '0') {
|
---|
| 2225 | # ,1,"foo, 3"056",,bar,\r\n
|
---|
| 2226 | # ^
|
---|
| 2227 | $$v_ref .= "\0";
|
---|
| 2228 | next;
|
---|
| 2229 | }
|
---|
| 2230 | if (defined $c2 and defined $quot and $c2 eq $quot) {
|
---|
| 2231 | # ,1,"foo, 3""56",,bar,\r\n
|
---|
| 2232 | # ^
|
---|
| 2233 | if ($ctx->{utf8}) {
|
---|
| 2234 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2235 | }
|
---|
| 2236 | $$v_ref .= $c2;
|
---|
| 2237 | next;
|
---|
| 2238 | }
|
---|
| 2239 | if ($ctx->{allow_loose_escapes} and defined $c2 and $c2 ne "\015") {
|
---|
| 2240 | # ,1,"foo, 3"56",,bar,\r\n
|
---|
| 2241 | # ^
|
---|
| 2242 | $$v_ref .= $c;
|
---|
| 2243 | $c = $c2;
|
---|
| 2244 | goto RESTART;
|
---|
| 2245 | }
|
---|
| 2246 | }
|
---|
| 2247 | if (defined $c2 and $c2 eq "\015") {
|
---|
| 2248 | if ($ctx->{eol_is_cr}) {
|
---|
| 2249 | # ,1,"foo, 3"\r
|
---|
| 2250 | # ^
|
---|
| 2251 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2252 | return 1;
|
---|
| 2253 | }
|
---|
| 2254 |
|
---|
| 2255 | my $c3 = $self->__get($ctx);
|
---|
| 2256 | if (defined $c3 and $c3 eq "\012") {
|
---|
| 2257 | # ,1,"foo, 3"\r\n
|
---|
| 2258 | # ^
|
---|
| 2259 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2260 | return 1;
|
---|
| 2261 | }
|
---|
| 2262 |
|
---|
| 2263 | if ($ctx->{useIO} and !$ctx->{eol_len} and $c3 !~ /[^\x09\x20-\x7E]/) {
|
---|
| 2264 | # ,1,"foo\n 3",,"bar"\r
|
---|
| 2265 | # baz,4
|
---|
| 2266 | # ^
|
---|
| 2267 | $self->__set_eol_is_cr($ctx);
|
---|
| 2268 | $ctx->{used}--;
|
---|
| 2269 | $ctx->{has_ahead} = 1;
|
---|
| 2270 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2271 | return 1;
|
---|
| 2272 | }
|
---|
| 2273 |
|
---|
| 2274 | $self->__parse_error($ctx, $quoesc ? 2023 : 2010, $ctx->{used} - 2);
|
---|
| 2275 | return;
|
---|
| 2276 | }
|
---|
| 2277 |
|
---|
| 2278 | if ($ctx->{allow_loose_quotes} and !$quoesc) {
|
---|
| 2279 | # ,1,"foo, 3"456",,bar,\r\n
|
---|
| 2280 | # ^
|
---|
| 2281 | $$v_ref .= $c;
|
---|
| 2282 | $c = $c2;
|
---|
| 2283 | goto RESTART;
|
---|
| 2284 | }
|
---|
| 2285 | # 1,"foo" ",3
|
---|
| 2286 | # ^
|
---|
| 2287 | if ($quoesc) {
|
---|
| 2288 | $ctx->{used}--;
|
---|
| 2289 | $self->__error_inside_quotes($ctx, 2023);
|
---|
| 2290 | return;
|
---|
| 2291 | }
|
---|
| 2292 | $self->__error_inside_quotes($ctx, 2011);
|
---|
| 2293 | return;
|
---|
| 2294 | }
|
---|
| 2295 | # !waitingForField, !InsideQuotes
|
---|
| 2296 | if ($ctx->{allow_loose_quotes}) { # 1,foo "boo" d'uh,1
|
---|
| 2297 | $ctx->{flag} |= IS_ERROR;
|
---|
| 2298 | $$v_ref .= $c;
|
---|
| 2299 | } else {
|
---|
| 2300 | $self->__error_inside_field($ctx, 2034);
|
---|
| 2301 | return;
|
---|
| 2302 | }
|
---|
| 2303 | }
|
---|
| 2304 | elsif (defined $c and defined $esc and $esc ne "\0" and $c eq $esc) {
|
---|
| 2305 | # This means quote_char != escape_char
|
---|
| 2306 | if ($waitingForField) {
|
---|
| 2307 | $waitingForField = 0;
|
---|
| 2308 | if ($ctx->{allow_unquoted_escape}) {
|
---|
| 2309 | # The escape character is the first character of an
|
---|
| 2310 | # unquoted field
|
---|
| 2311 | # ... get and store next character
|
---|
| 2312 | my $c2 = $self->__get($ctx);
|
---|
| 2313 | $$v_ref = "";
|
---|
| 2314 |
|
---|
| 2315 | if (!defined $c2) { # EOF
|
---|
| 2316 | $ctx->{used}--;
|
---|
| 2317 | $self->__error_inside_field($ctx, 2035);
|
---|
| 2318 | return;
|
---|
| 2319 | }
|
---|
| 2320 | if ($c2 eq '0') {
|
---|
| 2321 | $$v_ref .= "\0";
|
---|
| 2322 | }
|
---|
| 2323 | elsif (
|
---|
| 2324 | (defined $quot and $c2 eq $quot) or
|
---|
| 2325 | (defined $sep and $c2 eq $sep) or
|
---|
| 2326 | (defined $esc and $c2 eq $esc) or
|
---|
| 2327 | $ctx->{allow_loose_escapes}
|
---|
| 2328 | ) {
|
---|
| 2329 | if ($ctx->{utf8}) {
|
---|
| 2330 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2331 | }
|
---|
| 2332 | $$v_ref .= $c2;
|
---|
| 2333 | } else {
|
---|
| 2334 | $self->__parse_inside_quotes($ctx, 2025);
|
---|
| 2335 | return;
|
---|
| 2336 | }
|
---|
| 2337 | }
|
---|
| 2338 | }
|
---|
| 2339 | elsif ($ctx->{flag} & IS_QUOTED) {
|
---|
| 2340 | my $c2 = $self->__get($ctx);
|
---|
| 2341 | if (!defined $c2) { # EOF
|
---|
| 2342 | $ctx->{used}--;
|
---|
| 2343 | $self->__error_inside_quotes($ctx, 2024);
|
---|
| 2344 | return;
|
---|
| 2345 | }
|
---|
| 2346 | if ($c2 eq '0') {
|
---|
| 2347 | $$v_ref .= "\0";
|
---|
| 2348 | }
|
---|
| 2349 | elsif (
|
---|
| 2350 | (defined $quot and $c2 eq $quot) or
|
---|
| 2351 | (defined $sep and $c2 eq $sep) or
|
---|
| 2352 | (defined $esc and $c2 eq $esc) or
|
---|
| 2353 | $ctx->{allow_loose_escapes}
|
---|
| 2354 | ) {
|
---|
| 2355 | if ($ctx->{utf8}) {
|
---|
| 2356 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2357 | }
|
---|
| 2358 | $$v_ref .= $c2;
|
---|
| 2359 | } else {
|
---|
| 2360 | $ctx->{used}--;
|
---|
| 2361 | $self->__error_inside_quotes($ctx, 2025);
|
---|
| 2362 | return;
|
---|
| 2363 | }
|
---|
| 2364 | }
|
---|
| 2365 | elsif ($v_ref) {
|
---|
| 2366 | my $c2 = $self->__get($ctx);
|
---|
| 2367 | if (!defined $c2) { # EOF
|
---|
| 2368 | $ctx->{used}--;
|
---|
| 2369 | $self->__error_inside_field($ctx, 2035);
|
---|
| 2370 | return;
|
---|
| 2371 | }
|
---|
| 2372 | $$v_ref .= $c2;
|
---|
| 2373 | }
|
---|
| 2374 | else {
|
---|
| 2375 | $self->__error_inside_field($ctx, 2036);
|
---|
| 2376 | return;
|
---|
| 2377 | }
|
---|
| 2378 | }
|
---|
| 2379 | elsif (defined $c and ($c eq "\012" or $c eq '' or (defined $eol and $c eq $eol and $eol ne "\015"))) { # EOL
|
---|
| 2380 | EOLX:
|
---|
| 2381 | if ($waitingForField) {
|
---|
| 2382 | # ,1,"foo, 3",,bar,
|
---|
| 2383 | # ^
|
---|
| 2384 | if ($ctx->{blank_is_undef} or $ctx->{empty_is_undef}) {
|
---|
| 2385 | $$v_ref = undef;
|
---|
| 2386 | } else {
|
---|
| 2387 | $$v_ref = "";
|
---|
| 2388 | }
|
---|
| 2389 | unless ($ctx->{is_bound}) {
|
---|
| 2390 | push @$fields, $$v_ref;
|
---|
| 2391 | }
|
---|
| 2392 | if ($ctx->{keep_meta_info} and $fflags) {
|
---|
| 2393 | push @$fflags, $ctx->{flag};
|
---|
| 2394 | }
|
---|
| 2395 | return 1;
|
---|
| 2396 | }
|
---|
| 2397 | if ($ctx->{flag} & IS_QUOTED) {
|
---|
| 2398 | # ,1,"foo\n 3",,bar,
|
---|
| 2399 | # ^
|
---|
| 2400 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2401 | unless ($ctx->{binary}) {
|
---|
| 2402 | $self->__error_inside_quotes($ctx, 2021);
|
---|
| 2403 | return;
|
---|
| 2404 | }
|
---|
| 2405 | $$v_ref .= $c;
|
---|
| 2406 | }
|
---|
| 2407 | elsif ($ctx->{verbatim}) {
|
---|
| 2408 | # ,1,foo\n 3,,bar,
|
---|
| 2409 | # This feature should be deprecated
|
---|
| 2410 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2411 | unless ($ctx->{binary}) {
|
---|
| 2412 | $self->__error_inside_field($ctx, 2030);
|
---|
| 2413 | return;
|
---|
| 2414 | }
|
---|
| 2415 | $$v_ref .= $c unless $ctx->{eol} eq $c and $ctx->{useIO};
|
---|
| 2416 | }
|
---|
| 2417 | else {
|
---|
| 2418 | # sep=,
|
---|
| 2419 | # ^
|
---|
| 2420 | if (!$ctx->{recno} and $ctx->{fld_idx} == 1 and $ctx->{useIO} and $hit =~ /^sep=(.{1,16})$/i) {
|
---|
| 2421 | $ctx->{sep} = $1;
|
---|
| 2422 | use bytes;
|
---|
| 2423 | my $len = length $ctx->{sep};
|
---|
| 2424 | if ($len <= 16) {
|
---|
| 2425 | $ctx->{sep_len} = $len == 1 ? 0 : $len;
|
---|
| 2426 | return $self->____parse($ctx, $src, $fields, $fflags);
|
---|
| 2427 | }
|
---|
| 2428 | }
|
---|
| 2429 |
|
---|
| 2430 | # ,1,"foo\n 3",,bar
|
---|
| 2431 | # ^
|
---|
| 2432 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2433 | return 1;
|
---|
| 2434 | }
|
---|
| 2435 | }
|
---|
| 2436 | elsif (defined $c and $c eq "\015" and !$ctx->{verbatim}) {
|
---|
| 2437 | if ($waitingForField) {
|
---|
| 2438 | $waitingForField = 0;
|
---|
| 2439 | if ($ctx->{eol_is_cr}) {
|
---|
| 2440 | # ,1,"foo\n 3",,bar,\r
|
---|
| 2441 | # ^
|
---|
| 2442 | $c = "\012";
|
---|
| 2443 | goto RESTART;
|
---|
| 2444 | }
|
---|
| 2445 |
|
---|
| 2446 | my $c2 = $self->__get($ctx);
|
---|
| 2447 | if (!defined $c2) { # EOF
|
---|
| 2448 | # ,1,"foo\n 3",,bar,\r
|
---|
| 2449 | # ^
|
---|
| 2450 | $c = undef;
|
---|
| 2451 | goto RESTART;
|
---|
| 2452 | }
|
---|
| 2453 | if ($c2 eq "\012") { # \r is not optional before EOLX!
|
---|
| 2454 | # ,1,"foo\n 3",,bar,\r\n
|
---|
| 2455 | # ^
|
---|
| 2456 | $c = $c2;
|
---|
| 2457 | goto RESTART;
|
---|
| 2458 | }
|
---|
| 2459 |
|
---|
| 2460 | if ($ctx->{useIO} and !$ctx->{eol_len} and $c2 !~ /[^\x09\x20-\x7E]/) {
|
---|
| 2461 | # ,1,"foo\n 3",,bar,\r
|
---|
| 2462 | # baz,4
|
---|
| 2463 | # ^
|
---|
| 2464 | $self->__set_eol_is_cr($ctx);
|
---|
| 2465 | $ctx->{used}--;
|
---|
| 2466 | $ctx->{has_ahead} = 1;
|
---|
| 2467 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2468 | return 1;
|
---|
| 2469 | }
|
---|
| 2470 |
|
---|
| 2471 | # ,1,"foo\n 3",,bar,\r\t
|
---|
| 2472 | # ^
|
---|
| 2473 | $ctx->{used}--;
|
---|
| 2474 | $self->__error_inside_field($ctx, 2031);
|
---|
| 2475 | return;
|
---|
| 2476 | }
|
---|
| 2477 | if ($ctx->{flag} & IS_QUOTED) {
|
---|
| 2478 | # ,1,"foo\r 3",,bar,\r\t
|
---|
| 2479 | # ^
|
---|
| 2480 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2481 | unless ($ctx->{binary}) {
|
---|
| 2482 | $self->__error_inside_quotes($ctx, 2022);
|
---|
| 2483 | return;
|
---|
| 2484 | }
|
---|
| 2485 | $$v_ref .= $c;
|
---|
| 2486 | }
|
---|
| 2487 | else {
|
---|
| 2488 | if ($ctx->{eol_is_cr}) {
|
---|
| 2489 | # ,1,"foo\n 3",,bar\r
|
---|
| 2490 | # ^
|
---|
| 2491 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2492 | return 1;
|
---|
| 2493 | }
|
---|
| 2494 |
|
---|
| 2495 | my $c2 = $self->__get($ctx);
|
---|
| 2496 | if (defined $c2 and $c2 eq "\012") { # \r is not optional before EOLX!
|
---|
| 2497 | # ,1,"foo\n 3",,bar\r\n
|
---|
| 2498 | # ^
|
---|
| 2499 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2500 | return 1;
|
---|
| 2501 | }
|
---|
| 2502 |
|
---|
| 2503 | if ($ctx->{useIO} and !$ctx->{eol_len} and $c2 !~ /[^\x09\x20-\x7E]/) {
|
---|
| 2504 | # ,1,"foo\n 3",,bar\r
|
---|
| 2505 | # baz,4
|
---|
| 2506 | # ^
|
---|
| 2507 | $self->__set_eol_is_cr($ctx);
|
---|
| 2508 | $ctx->{used}--;
|
---|
| 2509 | $ctx->{has_ahead} = 1;
|
---|
| 2510 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2511 | return 1;
|
---|
| 2512 | }
|
---|
| 2513 |
|
---|
| 2514 | # ,1,"foo\n 3",,bar\r\t
|
---|
| 2515 | # ^
|
---|
| 2516 | $self->__error_inside_field($ctx, 2032);
|
---|
| 2517 | return;
|
---|
| 2518 | }
|
---|
| 2519 | }
|
---|
| 2520 | else {
|
---|
| 2521 | if ($ctx->{eolx} and $c eq $eol) {
|
---|
| 2522 | $c = '';
|
---|
| 2523 | goto EOLX;
|
---|
| 2524 | }
|
---|
| 2525 |
|
---|
| 2526 | if ($waitingForField) {
|
---|
| 2527 | if ($ctx->{allow_whitespace} and $self->__is_whitespace($ctx, $c)) {
|
---|
| 2528 | do {
|
---|
| 2529 | $c = $self->__get($ctx);
|
---|
| 2530 | last if !defined $c;
|
---|
| 2531 | } while $self->__is_whitespace($ctx, $c);
|
---|
| 2532 | goto RESTART;
|
---|
| 2533 | }
|
---|
| 2534 | $waitingForField = 0;
|
---|
| 2535 | goto RESTART;
|
---|
| 2536 | }
|
---|
| 2537 | if ($ctx->{flag} & IS_QUOTED) {
|
---|
| 2538 | if (!defined $c or $c =~ /[^\x09\x20-\x7E]/) {
|
---|
| 2539 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2540 | unless ($ctx->{binary} or $ctx->{utf8}) {
|
---|
| 2541 | $self->__error_inside_quotes($ctx, 2026);
|
---|
| 2542 | return;
|
---|
| 2543 | }
|
---|
| 2544 | }
|
---|
| 2545 | $$v_ref .= $c;
|
---|
| 2546 | } else {
|
---|
| 2547 | if (!defined $c or $c =~ /[^\x09\x20-\x7E]/) {
|
---|
| 2548 | $ctx->{flag} |= IS_BINARY;
|
---|
| 2549 | unless ($ctx->{binary} or $ctx->{utf8}) {
|
---|
| 2550 | $self->__error_inside_field($ctx, 2037);
|
---|
| 2551 | return;
|
---|
| 2552 | }
|
---|
| 2553 | }
|
---|
| 2554 | $$v_ref .= $c;
|
---|
| 2555 | }
|
---|
| 2556 | }
|
---|
| 2557 | last LOOP if $ctx->{useIO} and $ctx->{verbatim} and $ctx->{used} == $ctx->{size};
|
---|
| 2558 | }
|
---|
| 2559 | }
|
---|
| 2560 |
|
---|
| 2561 | if ($waitingForField) {
|
---|
| 2562 | if ($seenSomething or !$ctx->{useIO}) {
|
---|
| 2563 | # new field
|
---|
| 2564 | if (!$v_ref) {
|
---|
| 2565 | if ($ctx->{is_bound}) {
|
---|
| 2566 | $v_ref = $self->__bound_field($ctx, $fnum, 0);
|
---|
| 2567 | } else {
|
---|
| 2568 | $value = '';
|
---|
| 2569 | $v_ref = \$value;
|
---|
| 2570 | }
|
---|
| 2571 | $fnum++;
|
---|
| 2572 | return unless $v_ref;
|
---|
| 2573 | $ctx->{flag} = 0;
|
---|
| 2574 | $ctx->{fld_idx}++;
|
---|
| 2575 | }
|
---|
| 2576 | if ($ctx->{blank_is_undef} or $ctx->{empty_is_undef}) {
|
---|
| 2577 | $$v_ref = undef;
|
---|
| 2578 | } else {
|
---|
| 2579 | $$v_ref = "";
|
---|
| 2580 | }
|
---|
| 2581 | unless ($ctx->{is_bound}) {
|
---|
| 2582 | push @$fields, $$v_ref;
|
---|
| 2583 | }
|
---|
| 2584 | if ($ctx->{keep_meta_info} and $fflags) {
|
---|
| 2585 | push @$fflags, $ctx->{flag};
|
---|
| 2586 | }
|
---|
| 2587 | return 1;
|
---|
| 2588 | }
|
---|
| 2589 | $self->SetDiag(2012);
|
---|
| 2590 | return;
|
---|
| 2591 | }
|
---|
| 2592 |
|
---|
| 2593 | if ($ctx->{flag} & IS_QUOTED) {
|
---|
| 2594 | $self->__error_inside_quotes($ctx, 2027);
|
---|
| 2595 | return;
|
---|
| 2596 | }
|
---|
| 2597 |
|
---|
| 2598 | if ($v_ref) {
|
---|
| 2599 | $self->__push_value($ctx, $v_ref, $fields, $fflags, $ctx->{flag}, $fnum);
|
---|
| 2600 | }
|
---|
| 2601 | return 1;
|
---|
| 2602 | }
|
---|
| 2603 |
|
---|
# Make sure $ctx->{tmp} holds data that can still be consumed.
#
# Returns 1 when the current buffer is still usable or was refilled from
# $src, and an empty list/undef when nothing more can be read. In the
# latter case the useIO_EOF bit is set on $ctx->{useIO} (only when reading
# from a handle at all) and $ctx->{tmp} is guaranteed to be defined.
sub __get_from_src {
    my ($self, $ctx, $src) = @_;

    # A buffer that has not been touched yet, or one with unread data left.
    return 1 if defined $ctx->{tmp} and $ctx->{used} <= 0;
    return 1 if $ctx->{used} < $ctx->{size};

    # Plain string parsing: there is no handle to pull more data from.
    return unless $ctx->{useIO};

    my $line     = $src->getline;
    my $refilled = 0;

    if (defined $line) {
        if ($ctx->{has_ahead}) {
            # Prepend what was kept in _AHEAD (and the eol, when one is set).
            my $buffer = $self->{_AHEAD};
            $buffer .= $ctx->{eol} if $ctx->{eol_len};
            $buffer .= $line;
            $ctx->{tmp}       = $buffer;
            $ctx->{has_ahead} = 0;
        }
        else {
            $ctx->{tmp} = $line;
        }
        $refilled = 1;
    }
    elsif (delete $ctx->{has_leftover}) {
        # Handle is exhausted, but _AHEAD still holds unparsed data.
        $ctx->{tmp}       = $self->{_AHEAD};
        $ctx->{has_ahead} = 0;
        $ctx->{useIO} |= useIO_EOF;
        $refilled = 1;
    }

    if ($refilled and ($ctx->{size} = length $ctx->{tmp})) {
        $ctx->{used} = -1;
        $ctx->{utf8} = 1 if utf8::is_utf8($ctx->{tmp});
        pos($ctx->{tmp}) = 0;
        return 1;
    }

    $ctx->{tmp} = '' unless defined $ctx->{tmp};
    $ctx->{useIO} |= useIO_EOF;
    return;
}
|
---|
| 2640 |
|
---|
# Switch the parser to "bare carriage return is the end of line" mode.
#
# Updates the context, replaces the contents of the object's _CACHE with a
# copy of that context, and records the eol on the object itself.
sub __set_eol_is_cr {
    my ($self, $ctx) = @_;

    @{$ctx}{qw(eol eol_is_cr eol_len)} = ("\015", 1, 1);

    %{$self->{_CACHE}} = %$ctx;

    $self->{eol} = $ctx->{eol};
}
|
---|
| 2650 |
|
---|
# Return the scalar reference bound to column index $i.
#
# Unless $keep is true the referenced scalar is reset to the empty string
# first. Sets diagnostic 3006 when $i is not below $ctx->{is_bound}, and
# 3008 when the slot is not a reference or refers to a read-only value;
# nothing is returned in either case.
sub __bound_field {
    my ($self, $ctx, $i, $keep) = @_;

    if ($i >= $ctx->{is_bound}) {
        $self->SetDiag(3006);
        return;
    }

    my $slot = ref $ctx->{bound} eq 'ARRAY' ? $ctx->{bound}[$i] : undef;
    if (ref $slot) {
        return $slot if $keep;

        if (!Scalar::Util::readonly($$slot)) {
            $$slot = "";
            return $slot;
        }
    }

    $self->SetDiag(3008);
    return;
}
|
---|
| 2672 |
|
---|
# Fetch the next token from the buffer in $ctx->{tmp}.
#
# A token is either a match of $ctx->{_re} or a single character. The
# IS_BINARY bit is set on $ctx->{flag} when the token contains anything
# other than TAB or printable ASCII, and $ctx->{used} is moved to the new
# match position. Returns nothing when the buffer is exhausted.
sub __get {
    my ($self, $ctx) = @_;

    return unless defined $ctx->{used};
    return if $ctx->{used} >= $ctx->{size};

    # A failed scalar //g match resets pos(), so remember where we were.
    my $resume_at = pos($ctx->{tmp});

    if ($ctx->{tmp} =~ /\G($ctx->{_re}|.)/gs) {
        my $token = $1;
        $ctx->{flag} |= IS_BINARY if $token =~ /[^\x09\x20-\x7e]/;
        $ctx->{used} = pos($ctx->{tmp});
        return $token;
    }

    pos($ctx->{tmp}) = $resume_at;
    return;
}
|
---|
| 2690 |
|
---|
# Report parse error $error for a quoted field, at the position just
# before the current read offset.
sub __error_inside_quotes {
    my ($self, $ctx, $error) = @_;
    my $error_pos = $ctx->{used} - 1;
    $self->__parse_error($ctx, $error, $error_pos);
}
|
---|
| 2695 |
|
---|
# Report parse error $error for an unquoted field, at the position just
# before the current read offset.
sub __error_inside_field {
    my ($self, $ctx, $error) = @_;
    my $error_pos = $ctx->{used} - 1;
    $self->__parse_error($ctx, $error, $error_pos);
}
|
---|
| 2700 |
|
---|
# Record where a parse error happened and raise the diagnostic.
#
# Stores the error position and field index on the object, keeps a copy of
# the offending input when the buffer holds a true value, then hands the
# error code to SetDiag. Always returns nothing.
sub __parse_error {
    my ($self, $ctx, $error, $pos) = @_;

    @{$self}{qw(_ERROR_POS _ERROR_FLD)} = ($pos, $ctx->{fld_idx});

    if ($ctx->{tmp}) {
        $self->{_ERROR_INPUT} = $ctx->{tmp};
    }

    $self->SetDiag($error);
    return;
}
|
---|
| 2709 |
|
---|
# True when $c is a space or a TAB that is not currently in use as the
# separator, the quote or the escape character. Returns nothing for an
# undefined $c.
sub __is_whitespace {
    my ($self, $ctx, $c) = @_;
    return unless defined $c;

    # A character with a special role is never treated as whitespace.
    for my $role (qw(sep quo escape_char)) {
        my $special = $ctx->{$role};
        return !1 if defined $special and $c eq $special;
    }

    return ($c eq " " or $c eq "\t");
}
|
---|
| 2720 |
|
---|
# Finish the field referenced by $v_ref and append it to the record.
#
# Applies, in order: UTF-8 encoding when the source was UTF-8, formula
# handling, empty/blank-to-undef mapping, trailing whitespace stripping for
# unquoted fields and UTF-8 decoding for binary fields. The value goes onto
# @$fields unless columns are bound, and the flag goes onto @$fflags when
# meta info is kept.
sub __push_value { # AV_PUSH (part of)
    my ($self, $ctx, $v_ref, $fields, $fflags, $flag, $fnum) = @_;

    utf8::encode($$v_ref) if $ctx->{utf8};

    # Formula fields are delegated completely; note that this path pushes
    # the replacement value only and returns right away.
    if ($ctx->{formula} && $$v_ref && substr($$v_ref, 0, 1) eq '=') {
        my $replacement = $self->_formula($ctx, $$v_ref, $fnum);
        push @$fields, $replacement;
        return;
    }

    my $unquoted = !($flag & IS_QUOTED);
    my $blank    = (!defined $$v_ref or $$v_ref eq '');

    if ($blank and ($ctx->{empty_is_undef} or ($unquoted and $ctx->{blank_is_undef}))) {
        $$v_ref = undef;
    }
    else {
        $$v_ref =~ s/[ \t]+$// if $ctx->{allow_whitespace} && $unquoted;

        if ($flag & IS_BINARY and $ctx->{decode_utf8} and ($ctx->{utf8} || _is_valid_utf8($$v_ref))) {
            utf8::decode($$v_ref);
        }
    }

    push @$fields, $$v_ref unless $ctx->{is_bound};

    if ($ctx->{keep_meta_info} and $fflags) {
        push @$fflags, $flag;
    }
}
|
---|
| 2749 |
|
---|
# Read and parse one record from $io.
#
# Returns a reference to the list of fields, or undef when parsing failed
# or no record could be read.
sub getline {
    my ($self, $io) = @_;

    my (@fields, @fflags);
    return $self->__parse(\@fields, \@fflags, $io, 1) ? \@fields : undef;
}
|
---|
| 2757 |
|
---|
# Read all remaining records from $io and return them as a reference to a
# list of array references.
#
# A positive $offset skips that many records first; a negative $offset keeps
# only the last -$offset records. $len limits the number of records
# returned. Records rejected by an after_parse hook are left out.
sub getline_all {
    my ( $self, $io, $offset, $len ) = @_;

    my $ctx = $self->_setup_ctx;

    my ($tail, $count) = (0, 0);
    $offset ||= 0;
    ($tail, $offset) = (-$offset, -1) if $offset < 0;

    my (@record, @rows);

    RECORD:
    while ($self->___parse($ctx, \@record, undef, $io, 1)) {
        $ctx = $self->_setup_ctx;

        # Still skipping the leading records.
        if ($offset > 0) {
            $offset--;
            next RECORD;
        }

        # In tail mode keep a sliding window of at most $tail records.
        if ($count++ >= $tail and $tail) {
            shift @rows;
            $count--;
        }

        if ((($ctx->{has_hooks} || 0) & HOOK_AFTER_PARSE)
            and not $self->_hook(after_parse => \@record)) {
            next RECORD;
        }

        push @rows, [@record];

        last RECORD if defined $len && $count >= $len and $offset >= 0; # exceeds limit size
    }
    continue {
        # The same array is handed to the parser again, so empty it first.
        @record = ();
    }

    if ( defined $len && $count > $len ) {
        @rows = splice( @rows, 0, $len );
    }

    return \@rows;
}
|
---|
| 2803 |
|
---|
# Return 1 when the argument is a non-empty sequence of well-formed UTF-8
# encoded characters, 0 otherwise.
sub _is_valid_utf8 {
    my $character = qr{
          [\x00-\x7F]                               # one byte
        | [\xC2-\xDF] [\x80-\xBF]                   # two bytes
        | \xE0        [\xA0-\xBF] [\x80-\xBF]       # three bytes
        | [\xE1-\xEC] [\x80-\xBF] [\x80-\xBF]
        | \xED        [\x80-\x9F] [\x80-\xBF]
        | [\xEE-\xEF] [\x80-\xBF] [\x80-\xBF]
        | \xF0        [\x90-\xBF] [\x80-\xBF] [\x80-\xBF]   # four bytes
        | [\xF1-\xF3] [\x80-\xBF] [\x80-\xBF] [\x80-\xBF]
        | \xF4        [\x80-\x8F] [\x80-\xBF] [\x80-\xBF]
    }x;

    return $_[0] =~ /^(?:$character)+$/ ? 1 : 0;
}
|
---|
| 2817 |
|
---|
| 2818 | ################################################################################
|
---|
| 2819 | # methods for errors
|
---|
| 2820 | ################################################################################
|
---|
| 2821 |
|
---|
# Raise diagnostic $error and, when a position is given, record it as the
# error position afterwards. Always returns nothing.
sub _set_error_diag {
    my ( $self, $error, $pos ) = @_;

    $self->SetDiag($error);
    $self->{_ERROR_POS} = $pos if defined $pos;

    return;
}
|
---|
| 2833 |
|
---|
# Return the input that caused the last parse error, as stored on the
# object. Returns nothing when called on something that is neither
# hash-based nor an instance of a Text::CSV class.
sub error_input {
    my $self = shift;
    return unless $self;

    my $storage = Scalar::Util::reftype($self) || '';
    return unless $storage eq 'HASH' or (ref $self) =~ /^Text::CSV/;

    return $self->{_ERROR_INPUT};
}
|
---|
| 2841 |
|
---|
# Build the diagnostic object for $error: a two-element array holding the
# error code and its message from the $ERRORS table, blessed into
# Text::CSV::ErrorDiag.
sub _sv_diag {
    my ($self, $error) = @_;
    my $message = $ERRORS->{$error};
    return bless [$error, $message], 'Text::CSV::ErrorDiag';
}
|
---|
| 2846 |
|
---|
# Store the diagnostic for $error on the object and in $last_error.
#
# Error 0 also clears the recorded position, field and input; error 2012
# marks the object as being at end of file. With auto_diag enabled
# error_diag is called (in void context) before returning the diagnostic.
sub _set_diag {
    my ($self, $ctx, $error) = @_;

    $self->{_ERROR_DIAG} = $last_error = $self->_sv_diag($error);

    if ($error == 0) {
        @{$self}{qw(_ERROR_POS _ERROR_FLD _ERROR_INPUT)} = (0, 0, undef);
        $ctx->{has_error_input} = 0;
    }

    $self->{_EOF} = 1 if $error == 2012; # EOF

    $self->error_diag if $ctx->{auto_diag};

    return $last_error;
}
|
---|
| 2866 |
|
---|
# Set the current diagnostic and return the diagnostic object.
#
# Called on an instance the diagnostic is stored via _set_diag; called as a
# class method a fresh diagnostic object is built only. A defined $errstr
# overrides the message part of the result.
sub SetDiag {
    my ($self, $error, $errstr) = @_;

    my $diag = ref $self
        ? $self->_set_diag(scalar($self->_setup_ctx), $error)
        : $self->_sv_diag($error);

    $diag->[1] = $errstr if defined $errstr;

    $diag;
}
|
---|
| 2882 |
|
---|
| 2883 | ################################################################################
|
---|
package Text::CSV::ErrorDiag;

# Diagnostic object: a blessed two-element array of [ error code, message ].
# It stringifies to the message, and used as an operand of + - * / it
# yields the error code.

use strict;
use overload (
    '""' => \&stringify,
    ( map { $_ => \&numeric } qw( + - * / ) ),
    fallback => 1,
);

# Arithmetic handler: return the error code of whichever operand is the
# diagnostic object; the other operand is not used.
sub numeric {
    my ($lhs, $rhs) = @_;
    my $diag = ref $lhs ? $lhs : $rhs;
    return $diag->[0];
}

# String handler: return the error message.
sub stringify {
    my ($diag) = @_;
    return $diag->[1];
}
|
---|
| 2906 | ################################################################################
|
---|
| 2907 | 1;
|
---|
| 2908 | __END__
|
---|
| 2909 |
|
---|
| 2910 | =head1 NAME
|
---|
| 2911 |
|
---|
| 2912 | Text::CSV_PP - Text::CSV_XS compatible pure-Perl module
|
---|
| 2913 |
|
---|
| 2914 |
|
---|
| 2915 | =head1 SYNOPSIS
|
---|
| 2916 |
|
---|
| 2917 | This section is taken from Text::CSV_XS.
|
---|
| 2918 |
|
---|
| 2919 | # Functional interface
|
---|
| 2920 | use Text::CSV_PP qw( csv );
|
---|
| 2921 |
|
---|
| 2922 | # Read whole file in memory
|
---|
| 2923 | my $aoa = csv (in => "data.csv"); # as array of array
|
---|
| 2924 | my $aoh = csv (in => "data.csv",
|
---|
| 2925 | headers => "auto"); # as array of hash
|
---|
| 2926 |
|
---|
| 2927 | # Write array of arrays as csv file
|
---|
| 2928 | csv (in => $aoa, out => "file.csv", sep_char=> ";");
|
---|
| 2929 |
|
---|
| 2930 | # Only show lines where "code" is odd
|
---|
| 2931 | csv (in => "data.csv", filter => { code => sub { $_ % 2 }});
|
---|
| 2932 |
|
---|
| 2933 | # Object interface
|
---|
| 2934 | use Text::CSV_PP;
|
---|
| 2935 |
|
---|
| 2936 | my @rows;
|
---|
| 2937 | # Read/parse CSV
|
---|
| 2938 | my $csv = Text::CSV_PP->new ({ binary => 1, auto_diag => 1 });
|
---|
| 2939 | open my $fh, "<:encoding(utf8)", "test.csv" or die "test.csv: $!";
|
---|
| 2940 | while (my $row = $csv->getline ($fh)) {
|
---|
| 2941 | $row->[2] =~ m/pattern/ or next; # 3rd field should match
|
---|
| 2942 | push @rows, $row;
|
---|
| 2943 | }
|
---|
| 2944 | close $fh;
|
---|
| 2945 |
|
---|
| 2946 | # and write as CSV
|
---|
| 2947 | open $fh, ">:encoding(utf8)", "new.csv" or die "new.csv: $!";
|
---|
| 2948 | $csv->say ($fh, $_) for @rows;
|
---|
| 2949 | close $fh or die "new.csv: $!";
|
---|
| 2950 |
|
---|
| 2951 | =head1 DESCRIPTION
|
---|
| 2952 |
|
---|
| 2953 | Text::CSV_PP is a pure-perl module that provides facilities for the
|
---|
| 2954 | composition and decomposition of comma-separated values. This is
|
---|
| 2955 | (almost) compatible with much faster L<Text::CSV_XS>, and mainly
|
---|
| 2956 | used as its fallback module when you use L<Text::CSV> module without
|
---|
| 2957 | having installed Text::CSV_XS. If you don't have any reason to use
|
---|
| 2958 | this module directly, use Text::CSV for speed boost and portability
|
---|
| 2959 | (or maybe Text::CSV_XS when you write a one-off script and don't need
|
---|
| 2960 | to care about portability).
|
---|
| 2961 |
|
---|
| 2962 | The following caveats are taken from the doc of Text::CSV_XS.
|
---|
| 2963 |
|
---|
| 2964 | =head2 Embedded newlines
|
---|
| 2965 |
|
---|
| 2966 | B<Important Note>: The default behavior is to accept only ASCII characters
|
---|
| 2967 | in the range from C<0x20> (space) to C<0x7E> (tilde). This means that the
|
---|
| 2968 | fields can not contain newlines. If your data contains newlines embedded in
|
---|
| 2969 | fields, or characters above C<0x7E> (tilde), or binary data, you B<I<must>>
|
---|
| 2970 | set C<< binary => 1 >> in the call to L</new>. To cover the widest range of
|
---|
| 2971 | parsing options, you will always want to set binary.
|
---|
| 2972 |
|
---|
| 2973 | But you still have the problem that you have to pass a correct line to the
|
---|
| 2974 | L</parse> method, which is more complicated from the usual point of usage:
|
---|
| 2975 |
|
---|
| 2976 | my $csv = Text::CSV_PP->new ({ binary => 1, eol => $/ });
|
---|
| 2977 | while (<>) { # WRONG!
|
---|
| 2978 | $csv->parse ($_);
|
---|
| 2979 | my @fields = $csv->fields ();
|
---|
| 2980 | }
|
---|
| 2981 |
|
---|
| 2982 | this will break, as the C<while> might read broken lines: it does not care
|
---|
| 2983 | about the quoting. If you need to support embedded newlines, the way to go
|
---|
| 2984 | is to B<not> pass L<C<eol>|/eol> in the parser (it accepts C<\n>, C<\r>,
|
---|
| 2985 | B<and> C<\r\n> by default) and then
|
---|
| 2986 |
|
---|
| 2987 | my $csv = Text::CSV_PP->new ({ binary => 1 });
|
---|
| 2988 | open my $fh, "<", $file or die "$file: $!";
|
---|
| 2989 | while (my $row = $csv->getline ($fh)) {
|
---|
| 2990 | my @fields = @$row;
|
---|
| 2991 | }
|
---|
| 2992 |
|
---|
| 2993 | The old(er) way of using global file handles is still supported
|
---|
| 2994 |
|
---|
| 2995 | while (my $row = $csv->getline (*ARGV)) { ... }
|
---|
| 2996 |
|
---|
| 2997 | =head2 Unicode
|
---|
| 2998 |
|
---|
| 2999 | Unicode is only tested to work with perl-5.8.2 and up.
|
---|
| 3000 |
|
---|
| 3001 | See also L</BOM>.
|
---|
| 3002 |
|
---|
| 3003 | The simplest way to ensure the correct encoding is used for in- and output
|
---|
| 3004 | is by either setting layers on the filehandles, or setting the L</encoding>
|
---|
| 3005 | argument for L</csv>.
|
---|
| 3006 |
|
---|
| 3007 | open my $fh, "<:encoding(UTF-8)", "in.csv" or die "in.csv: $!";
|
---|
| 3008 | or
|
---|
| 3009 | my $aoa = csv (in => "in.csv", encoding => "UTF-8");
|
---|
| 3010 |
|
---|
| 3011 | open my $fh, ">:encoding(UTF-8)", "out.csv" or die "out.csv: $!";
|
---|
| 3012 | or
|
---|
| 3013 | csv (in => $aoa, out => "out.csv", encoding => "UTF-8");
|
---|
| 3014 |
|
---|
| 3015 | On parsing (both for L</getline> and L</parse>), if the source is marked
|
---|
| 3016 | being UTF8, then all fields that are marked binary will also be marked UTF8.
|
---|
| 3017 |
|
---|
| 3018 | On combining (L</print> and L</combine>): if any of the combining fields
|
---|
| 3019 | was marked UTF8, the resulting string will be marked as UTF8. Note however
|
---|
| 3020 | that all fields I<before> the first field marked UTF8 that contained 8-bit
|
---|
| 3021 | characters and were not upgraded to UTF8 will be C<bytes> in the
|
---|
| 3022 | resulting string too, possibly causing unexpected errors. If you pass data
|
---|
| 3023 | of different encoding, or you don't know if there is different encoding,
|
---|
| 3024 | force it to be upgraded before you pass them on:
|
---|
| 3025 |
|
---|
| 3026 | $csv->print ($fh, [ map { utf8::upgrade (my $x = $_); $x } @data ]);
|
---|
| 3027 |
|
---|
| 3028 | For complete control over encoding, please use L<Text::CSV::Encoded>:
|
---|
| 3029 |
|
---|
| 3030 | use Text::CSV::Encoded;
|
---|
| 3031 | my $csv = Text::CSV::Encoded->new ({
|
---|
| 3032 | encoding_in => "iso-8859-1", # the encoding comes into Perl
|
---|
| 3033 | encoding_out => "cp1252", # the encoding comes out of Perl
|
---|
| 3034 | });
|
---|
| 3035 |
|
---|
| 3036 | $csv = Text::CSV::Encoded->new ({ encoding => "utf8" });
|
---|
| 3037 | # combine () and print () accept *literally* utf8 encoded data
|
---|
| 3038 | # parse () and getline () return *literally* utf8 encoded data
|
---|
| 3039 |
|
---|
| 3040 | $csv = Text::CSV::Encoded->new ({ encoding => undef }); # default
|
---|
| 3041 | # combine () and print () accept UTF8 marked data
|
---|
| 3042 | # parse () and getline () return UTF8 marked data
|
---|
| 3043 |
|
---|
| 3044 | =head2 BOM
|
---|
| 3045 |
|
---|
| 3046 | BOM (or Byte Order Mark) handling is available only inside the L</header>
|
---|
| 3047 | method. This method supports the following encodings: C<utf-8>, C<utf-1>,
|
---|
| 3048 | C<utf-32be>, C<utf-32le>, C<utf-16be>, C<utf-16le>, C<utf-ebcdic>, C<scsu>,
|
---|
| 3049 | C<bocu-1>, and C<gb-18030>. See L<Wikipedia|https://en.wikipedia.org/wiki/Byte_order_mark>.
|
---|
| 3050 |
|
---|
| 3051 | If a file has a BOM, the easiest way to deal with that is
|
---|
| 3052 |
|
---|
| 3053 | my $aoh = csv (in => $file, detect_bom => 1);
|
---|
| 3054 |
|
---|
| 3055 | All records will be encoded based on the detected BOM.
|
---|
| 3056 |
|
---|
| 3057 | This implies a call to the L</header> method, which defaults to also set
|
---|
| 3058 | the L</column_names>. So this is B<not> the same as
|
---|
| 3059 |
|
---|
| 3060 | my $aoh = csv (in => $file, headers => "auto");
|
---|
| 3061 |
|
---|
| 3062 | which only reads the first record to set L</column_names> but ignores any
|
---|
| 3063 | meaning of a possibly present BOM.
|
---|
| 3064 |
|
---|
| 3065 | =head1 METHODS
|
---|
| 3066 |
|
---|
| 3067 | This section is also taken from Text::CSV_XS.
|
---|
| 3068 |
|
---|
| 3069 | =head2 version
|
---|
| 3070 |
|
---|
| 3071 | (Class method) Returns the current module version.
|
---|
| 3072 |
|
---|
| 3073 | =head2 new
|
---|
| 3074 |
|
---|
| 3075 | (Class method) Returns a new instance of class Text::CSV_PP. The attributes
|
---|
| 3076 | are described by the (optional) hash ref C<\%attr>.
|
---|
| 3077 |
|
---|
| 3078 | my $csv = Text::CSV_PP->new ({ attributes ... });
|
---|
| 3079 |
|
---|
| 3080 | The following attributes are available:
|
---|
| 3081 |
|
---|
| 3082 | =head3 eol
|
---|
| 3083 |
|
---|
| 3084 | my $csv = Text::CSV_PP->new ({ eol => $/ });
|
---|
| 3085 | $csv->eol (undef);
|
---|
| 3086 | my $eol = $csv->eol;
|
---|
| 3087 |
|
---|
| 3088 | The end-of-line string to add to rows for L</print> or the record separator
|
---|
| 3089 | for L</getline>.
|
---|
| 3090 |
|
---|
| 3091 | When not passed in a B<parser> instance, the default behavior is to accept
|
---|
| 3092 | C<\n>, C<\r>, and C<\r\n>, so it is probably safer to not specify C<eol> at
|
---|
| 3093 | all. Passing C<undef> or the empty string behave the same.
|
---|
| 3094 |
|
---|
| 3095 | When not passed in a B<generating> instance, records are not terminated at
|
---|
| 3096 | all, so it is probably wise to pass something you expect. A safe choice for
|
---|
| 3097 | C<eol> on output is either C<$/> or C<\r\n>.
|
---|
| 3098 |
|
---|
| 3099 | Common values for C<eol> are C<"\012"> (C<\n> or Line Feed), C<"\015\012">
|
---|
| 3100 | (C<\r\n> or Carriage Return, Line Feed), and C<"\015"> (C<\r> or Carriage
|
---|
| 3101 | Return). The L<C<eol>|/eol> attribute cannot exceed 7 (ASCII) characters.
|
---|
| 3102 |
|
---|
| 3103 | If both C<$/> and L<C<eol>|/eol> equal C<"\015">, parsing lines that end on
|
---|
| 3104 | only a Carriage Return without Line Feed, will be L</parse>d correctly.
|
---|
| 3105 |
|
---|
| 3106 | =head3 sep_char
|
---|
| 3107 |
|
---|
| 3108 | my $csv = Text::CSV_PP->new ({ sep_char => ";" });
|
---|
| 3109 | $csv->sep_char (";");
|
---|
| 3110 | my $c = $csv->sep_char;
|
---|
| 3111 |
|
---|
| 3112 | The char used to separate fields, by default a comma. (C<,>). Limited to a
|
---|
| 3113 | single-byte character, usually in the range from C<0x20> (space) to C<0x7E>
|
---|
| 3114 | (tilde). When longer sequences are required, use L<C<sep>|/sep>.
|
---|
| 3115 |
|
---|
| 3116 | The separation character can not be equal to the quote character or to the
|
---|
| 3117 | escape character.
|
---|
| 3118 |
|
---|
| 3119 | =head3 sep
|
---|
| 3120 |
|
---|
| 3121 | my $csv = Text::CSV_PP->new ({ sep => "\N{FULLWIDTH COMMA}" });
|
---|
| 3122 | $csv->sep (";");
|
---|
| 3123 | my $sep = $csv->sep;
|
---|
| 3124 |
|
---|
| 3125 | The chars used to separate fields, by default undefined. Limited to 8 bytes.
|
---|
| 3126 |
|
---|
| 3127 | When set, overrules L<C<sep_char>|/sep_char>. If its length is one byte it
|
---|
| 3128 | acts as an alias to L<C<sep_char>|/sep_char>.
|
---|
| 3129 |
|
---|
| 3130 | =head3 quote_char
|
---|
| 3131 |
|
---|
| 3132 | my $csv = Text::CSV_PP->new ({ quote_char => "'" });
|
---|
| 3133 | $csv->quote_char (undef);
|
---|
| 3134 | my $c = $csv->quote_char;
|
---|
| 3135 |
|
---|
| 3136 | The character to quote fields containing blanks or binary data, by default
|
---|
| 3137 | the double quote character (C<">). A value of undef suppresses quote chars
|
---|
| 3138 | (for simple cases only). Limited to a single-byte character, usually in the
|
---|
| 3139 | range from C<0x20> (space) to C<0x7E> (tilde). When longer sequences are
|
---|
| 3140 | required, use L<C<quote>|/quote>.
|
---|
| 3141 |
|
---|
| 3142 | C<quote_char> can not be equal to L<C<sep_char>|/sep_char>.
|
---|
| 3143 |
|
---|
| 3144 | =head3 quote
|
---|
| 3145 |
|
---|
| 3146 | my $csv = Text::CSV_PP->new ({ quote => "\N{FULLWIDTH QUOTATION MARK}" });
|
---|
| 3147 | $csv->quote ("'");
|
---|
| 3148 | my $quote = $csv->quote;
|
---|
| 3149 |
|
---|
| 3150 | The chars used to quote fields, by default undefined. Limited to 8 bytes.
|
---|
| 3151 |
|
---|
| 3152 | When set, overrules L<C<quote_char>|/quote_char>. If its length is one byte
|
---|
| 3153 | it acts as an alias to L<C<quote_char>|/quote_char>.
|
---|
| 3154 |
|
---|
| 3155 | =head3 escape_char
|
---|
| 3156 |
|
---|
| 3157 | my $csv = Text::CSV_PP->new ({ escape_char => "\\" });
|
---|
| 3158 | $csv->escape_char (":");
|
---|
| 3159 | my $c = $csv->escape_char;
|
---|
| 3160 |
|
---|
| 3161 | The character to escape certain characters inside quoted fields. This is
|
---|
| 3162 | limited to a single-byte character, usually in the range from C<0x20>
|
---|
| 3163 | (space) to C<0x7E> (tilde).
|
---|
| 3164 |
|
---|
| 3165 | The C<escape_char> defaults to being the double-quote mark (C<">). In other
|
---|
| 3166 | words the same as the default L<C<quote_char>|/quote_char>. This means that
|
---|
| 3167 | doubling the quote mark in a field escapes it:
|
---|
| 3168 |
|
---|
| 3169 | "foo","bar","Escape ""quote mark"" with two ""quote marks""","baz"
|
---|
| 3170 |
|
---|
| 3171 | If you change the L<C<quote_char>|/quote_char> without changing the
|
---|
| 3172 | C<escape_char>, the C<escape_char> will still be the double-quote (C<">).
|
---|
| 3173 | If instead you want to escape the L<C<quote_char>|/quote_char> by doubling
|
---|
| 3174 | it you will need to also change the C<escape_char> to be the same as what
|
---|
| 3175 | you have changed the L<C<quote_char>|/quote_char> to.
|
---|
| 3176 |
|
---|
| 3177 | Setting C<escape_char> to C<undef> or C<""> will disable escaping completely
|
---|
| 3178 | and is greatly discouraged. This will also disable C<escape_null>.
|
---|
| 3179 |
|
---|
| 3180 | The escape character can not be equal to the separation character.
|
---|
| 3181 |
|
---|
| 3182 | =head3 binary
|
---|
| 3183 |
|
---|
| 3184 | my $csv = Text::CSV_PP->new ({ binary => 1 });
|
---|
| 3185 | $csv->binary (0);
|
---|
| 3186 | my $f = $csv->binary;
|
---|
| 3187 |
|
---|
| 3188 | If this attribute is C<1>, you may use binary characters in quoted fields,
|
---|
| 3189 | including line feeds, carriage returns and C<NULL> bytes. (The latter could
|
---|
| 3190 | be escaped as C<"0>.) By default this feature is off.
|
---|
| 3191 |
|
---|
| 3192 | If a string is marked UTF8, C<binary> will be turned on automatically when
|
---|
| 3193 | binary characters other than C<CR> and C<NL> are encountered. Note that a
|
---|
| 3194 | simple string like C<"\x{00a0}"> might still be binary, but not marked UTF8,
|
---|
| 3195 | so setting C<< { binary => 1 } >> is still a wise option.
|
---|
| 3196 |
|
---|
| 3197 | =head3 strict
|
---|
| 3198 |
|
---|
| 3199 | my $csv = Text::CSV_PP->new ({ strict => 1 });
|
---|
| 3200 | $csv->strict (0);
|
---|
| 3201 | my $f = $csv->strict;
|
---|
| 3202 |
|
---|
| 3203 | If this attribute is set to C<1>, any row that parses to a different number
|
---|
| 3204 | of fields than the previous row will cause the parser to throw error 2014.
|
---|
| 3205 |
|
---|
| 3206 | =head3 formula_handling
|
---|
| 3207 |
|
---|
| 3208 | =head3 formula
|
---|
| 3209 |
|
---|
| 3210 | my $csv = Text::CSV_PP->new ({ formula => "none" });
|
---|
| 3211 | $csv->formula ("none");
|
---|
| 3212 | my $f = $csv->formula;
|
---|
| 3213 |
|
---|
| 3214 | This defines the behavior of fields containing I<formulas>. As formulas are
|
---|
| 3215 | considered dangerous in spreadsheets, this attribute can define an optional
|
---|
| 3216 | action to be taken if a field starts with an equal sign (C<=>).
|
---|
| 3217 |
|
---|
| 3218 | For purpose of code-readability, this can also be written as
|
---|
| 3219 |
|
---|
| 3220 | my $csv = Text::CSV_PP->new ({ formula_handling => "none" });
|
---|
| 3221 | $csv->formula_handling ("none");
|
---|
| 3222 | my $f = $csv->formula_handling;
|
---|
| 3223 |
|
---|
| 3224 | Possible values for this attribute are
|
---|
| 3225 |
|
---|
| 3226 | =over 2
|
---|
| 3227 |
|
---|
| 3228 | =item none
|
---|
| 3229 |
|
---|
| 3230 | Take no specific action. This is the default.
|
---|
| 3231 |
|
---|
| 3232 | $csv->formula ("none");
|
---|
| 3233 |
|
---|
| 3234 | =item die
|
---|
| 3235 |
|
---|
| 3236 | Cause the process to C<die> whenever a leading C<=> is encountered.
|
---|
| 3237 |
|
---|
| 3238 | $csv->formula ("die");
|
---|
| 3239 |
|
---|
| 3240 | =item croak
|
---|
| 3241 |
|
---|
| 3242 | Cause the process to C<croak> whenever a leading C<=> is encountered. (See
|
---|
| 3243 | L<Carp>)
|
---|
| 3244 |
|
---|
| 3245 | $csv->formula ("croak");
|
---|
| 3246 |
|
---|
| 3247 | =item diag
|
---|
| 3248 |
|
---|
| 3249 | Report position and content of the field whenever a leading C<=> is found.
|
---|
| 3250 | The value of the field is unchanged.
|
---|
| 3251 |
|
---|
| 3252 | $csv->formula ("diag");
|
---|
| 3253 |
|
---|
| 3254 | =item empty
|
---|
| 3255 |
|
---|
| 3256 | Replace the content of fields that start with a C<=> with the empty string.
|
---|
| 3257 |
|
---|
| 3258 | $csv->formula ("empty");
|
---|
| 3259 | $csv->formula ("");
|
---|
| 3260 |
|
---|
| 3261 | =item undef
|
---|
| 3262 |
|
---|
| 3263 | Replace the content of fields that start with a C<=> with C<undef>.
|
---|
| 3264 |
|
---|
| 3265 | $csv->formula ("undef");
|
---|
| 3266 | $csv->formula (undef);
|
---|
| 3267 |
|
---|
| 3268 | =back
|
---|
| 3269 |
|
---|
| 3270 | All other values will give a warning and then fallback to C<diag>.
|
---|
| 3271 |
|
---|
| 3272 | =head3 decode_utf8
|
---|
| 3273 |
|
---|
| 3274 | my $csv = Text::CSV_PP->new ({ decode_utf8 => 1 });
|
---|
| 3275 | $csv->decode_utf8 (0);
|
---|
| 3276 | my $f = $csv->decode_utf8;
|
---|
| 3277 |
|
---|
| 3278 | This attribute defaults to TRUE.
|
---|
| 3279 |
|
---|
| 3280 | While I<parsing>, fields that are valid UTF-8, are automatically set to be
|
---|
| 3281 | UTF-8, so that
|
---|
| 3282 |
|
---|
| 3283 | $csv->parse ("\xC4\xA8\n");
|
---|
| 3284 |
|
---|
| 3285 | results in
|
---|
| 3286 |
|
---|
| 3287 | PV("\304\250"\0) [UTF8 "\x{128}"]
|
---|
| 3288 |
|
---|
| 3289 | Sometimes it might not be a desired action. To prevent those upgrades, set
|
---|
| 3290 | this attribute to false, and the result will be
|
---|
| 3291 |
|
---|
| 3292 | PV("\304\250"\0)
|
---|
| 3293 |
|
---|
| 3294 | =head3 auto_diag
|
---|
| 3295 |
|
---|
| 3296 | my $csv = Text::CSV_PP->new ({ auto_diag => 1 });
|
---|
| 3297 | $csv->auto_diag (2);
|
---|
| 3298 | my $l = $csv->auto_diag;
|
---|
| 3299 |
|
---|
| 3300 | Setting this attribute to a number between C<1> and C<9> causes L</error_diag>
|
---|
| 3301 | to be automatically called in void context upon errors.
|
---|
| 3302 |
|
---|
| 3303 | In case of error C<2012 - EOF>, this call will be void.
|
---|
| 3304 |
|
---|
| 3305 | If C<auto_diag> is set to a numeric value greater than C<1>, it will C<die>
|
---|
| 3306 | on errors instead of C<warn>. If set to anything unrecognized, it will be
|
---|
| 3307 | silently ignored.
|
---|
| 3308 |
|
---|
| 3309 | Future extensions to this feature will include more reliable auto-detection
|
---|
| 3310 | of C<autodie> being active in the scope of which the error occurred which
|
---|
| 3311 | will increment the value of C<auto_diag> with C<1> the moment the error is
|
---|
| 3312 | detected.
|
---|
| 3313 |
|
---|
| 3314 | =head3 diag_verbose
|
---|
| 3315 |
|
---|
| 3316 | my $csv = Text::CSV_PP->new ({ diag_verbose => 1 });
|
---|
| 3317 | $csv->diag_verbose (2);
|
---|
| 3318 | my $l = $csv->diag_verbose;
|
---|
| 3319 |
|
---|
| 3320 | Set the verbosity of the output triggered by C<auto_diag>. Currently only
|
---|
| 3321 | adds the current input-record-number (if known) to the diagnostic output
|
---|
| 3322 | with an indication of the position of the error.
|
---|
| 3323 |
|
---|
| 3324 | =head3 blank_is_undef
|
---|
| 3325 |
|
---|
| 3326 | my $csv = Text::CSV_PP->new ({ blank_is_undef => 1 });
|
---|
| 3327 | $csv->blank_is_undef (0);
|
---|
| 3328 | my $f = $csv->blank_is_undef;
|
---|
| 3329 |
|
---|
| 3330 | Under normal circumstances, C<CSV> data makes no distinction between quoted-
|
---|
| 3331 | and unquoted empty fields. These both end up in an empty string field once
|
---|
| 3332 | read, thus
|
---|
| 3333 |
|
---|
| 3334 | 1,"",," ",2
|
---|
| 3335 |
|
---|
| 3336 | is read as
|
---|
| 3337 |
|
---|
| 3338 | ("1", "", "", " ", "2")
|
---|
| 3339 |
|
---|
| 3340 | When I<writing> C<CSV> files with either L<C<always_quote>|/always_quote>
|
---|
| 3341 | or L<C<quote_empty>|/quote_empty> set, the unquoted I<empty> field is the
|
---|
| 3342 | result of an undefined value. To enable this distinction when I<reading>
|
---|
| 3343 | C<CSV> data, the C<blank_is_undef> attribute will cause unquoted empty
|
---|
| 3344 | fields to be set to C<undef>, causing the above to be parsed as
|
---|
| 3345 |
|
---|
| 3346 | ("1", "", undef, " ", "2")
|
---|
| 3347 |
|
---|
| 3348 | Note that this is specifically important when loading C<CSV> fields into a
|
---|
| 3349 | database that allows C<NULL> values, as the perl equivalent for C<NULL> is
|
---|
| 3350 | C<undef> in L<DBI> land.
|
---|
| 3351 |
|
---|
| 3352 | =head3 empty_is_undef
|
---|
| 3353 |
|
---|
| 3354 | my $csv = Text::CSV_PP->new ({ empty_is_undef => 1 });
|
---|
| 3355 | $csv->empty_is_undef (0);
|
---|
| 3356 | my $f = $csv->empty_is_undef;
|
---|
| 3357 |
|
---|
| 3358 | Going one step further than L<C<blank_is_undef>|/blank_is_undef>, this
|
---|
| 3359 | attribute converts all empty fields to C<undef>, so
|
---|
| 3360 |
|
---|
| 3361 | 1,"",," ",2
|
---|
| 3362 |
|
---|
| 3363 | is read as
|
---|
| 3364 |
|
---|
| 3365 | (1, undef, undef, " ", 2)
|
---|
| 3366 |
|
---|
| 3367 | Note that this affects only fields that are originally empty, not fields
|
---|
| 3368 | that are empty after stripping allowed whitespace. YMMV.
|
---|
| 3369 |
|
---|
| 3370 | =head3 allow_whitespace
|
---|
| 3371 |
|
---|
| 3372 | my $csv = Text::CSV_PP->new ({ allow_whitespace => 1 });
|
---|
| 3373 | $csv->allow_whitespace (0);
|
---|
| 3374 | my $f = $csv->allow_whitespace;
|
---|
| 3375 |
|
---|
| 3376 | When this option is set to true, the whitespace (C<TAB>'s and C<SPACE>'s)
|
---|
| 3377 | surrounding the separation character is removed when parsing. If either
|
---|
| 3378 | C<TAB> or C<SPACE> is one of the three characters L<C<sep_char>|/sep_char>,
|
---|
| 3379 | L<C<quote_char>|/quote_char>, or L<C<escape_char>|/escape_char> it will not
|
---|
| 3380 | be considered whitespace.
|
---|
| 3381 |
|
---|
| 3382 | Now lines like:
|
---|
| 3383 |
|
---|
| 3384 | 1 , "foo" , bar , 3 , zapp
|
---|
| 3385 |
|
---|
| 3386 | are parsed as valid C<CSV>, even though it violates the C<CSV> specs.
|
---|
| 3387 |
|
---|
| 3388 | Note that B<all> whitespace is stripped from both start and end of each
|
---|
| 3389 | field. That would make it I<more> than a I<feature> to enable parsing bad
|
---|
| 3390 | C<CSV> lines, as
|
---|
| 3391 |
|
---|
| 3392 | 1, 2.0, 3, ape , monkey
|
---|
| 3393 |
|
---|
| 3394 | will now be parsed as
|
---|
| 3395 |
|
---|
| 3396 | ("1", "2.0", "3", "ape", "monkey")
|
---|
| 3397 |
|
---|
| 3398 | even if the original line was perfectly acceptable C<CSV>.
|
---|
| 3399 |
|
---|
| 3400 | =head3 allow_loose_quotes
|
---|
| 3401 |
|
---|
| 3402 | my $csv = Text::CSV_PP->new ({ allow_loose_quotes => 1 });
|
---|
| 3403 | $csv->allow_loose_quotes (0);
|
---|
| 3404 | my $f = $csv->allow_loose_quotes;
|
---|
| 3405 |
|
---|
| 3406 | By default, parsing unquoted fields containing L<C<quote_char>|/quote_char>
|
---|
| 3407 | characters like
|
---|
| 3408 |
|
---|
| 3409 | 1,foo "bar" baz,42
|
---|
| 3410 |
|
---|
| 3411 | would result in parse error 2034. Though it is still bad practice to allow
|
---|
| 3412 | this format, we cannot help the fact that some vendors make their
|
---|
| 3413 | applications spit out lines styled this way.
|
---|
| 3414 |
|
---|
| 3415 | If there is B<really> bad C<CSV> data, like
|
---|
| 3416 |
|
---|
| 3417 | 1,"foo "bar" baz",42
|
---|
| 3418 |
|
---|
| 3419 | or
|
---|
| 3420 |
|
---|
| 3421 | 1,""foo bar baz"",42
|
---|
| 3422 |
|
---|
| 3423 | there is a way to get this data-line parsed and leave the quotes inside the
|
---|
| 3424 | quoted field as-is. This can be achieved by setting C<allow_loose_quotes>
|
---|
| 3425 | B<AND> making sure that the L<C<escape_char>|/escape_char> is I<not> equal
|
---|
| 3426 | to L<C<quote_char>|/quote_char>.
|
---|
| 3427 |
|
---|
| 3428 | =head3 allow_loose_escapes
|
---|
| 3429 |
|
---|
| 3430 | my $csv = Text::CSV_PP->new ({ allow_loose_escapes => 1 });
|
---|
| 3431 | $csv->allow_loose_escapes (0);
|
---|
| 3432 | my $f = $csv->allow_loose_escapes;
|
---|
| 3433 |
|
---|
| 3434 | Parsing fields that have L<C<escape_char>|/escape_char> characters that
|
---|
| 3435 | escape characters that do not need to be escaped, like:
|
---|
| 3436 |
|
---|
| 3437 | my $csv = Text::CSV_PP->new ({ escape_char => "\\" });
|
---|
| 3438 | $csv->parse (qq{1,"my bar\'s",baz,42});
|
---|
| 3439 |
|
---|
| 3440 | would result in parse error 2025. Though it is bad practice to allow this
|
---|
| 3441 | format, this attribute enables you to treat all escape character sequences
|
---|
| 3442 | equal.
|
---|
| 3443 |
|
---|
| 3444 | =head3 allow_unquoted_escape
|
---|
| 3445 |
|
---|
| 3446 | my $csv = Text::CSV_PP->new ({ allow_unquoted_escape => 1 });
|
---|
| 3447 | $csv->allow_unquoted_escape (0);
|
---|
| 3448 | my $f = $csv->allow_unquoted_escape;
|
---|
| 3449 |
|
---|
| 3450 | A backward compatibility issue where L<C<escape_char>|/escape_char> differs
|
---|
| 3451 | from L<C<quote_char>|/quote_char> prevents L<C<escape_char>|/escape_char>
|
---|
| 3452 | from being in the first position of a field. If L<C<quote_char>|/quote_char> is
|
---|
| 3453 | equal to the default C<"> and L<C<escape_char>|/escape_char> is set to C<\>,
|
---|
| 3454 | this would be illegal:
|
---|
| 3455 |
|
---|
| 3456 | 1,\0,2
|
---|
| 3457 |
|
---|
| 3458 | Setting this attribute to C<1> might help to overcome issues with backward
|
---|
| 3459 | compatibility and allow this style.
|
---|
| 3460 |
|
---|
| 3461 | =head3 always_quote
|
---|
| 3462 |
|
---|
| 3463 | my $csv = Text::CSV_PP->new ({ always_quote => 1 });
|
---|
| 3464 | $csv->always_quote (0);
|
---|
| 3465 | my $f = $csv->always_quote;
|
---|
| 3466 |
|
---|
| 3467 | By default the generated fields are quoted only if they I<need> to be. For
|
---|
| 3468 | example, if they contain the separator character. If you set this attribute
|
---|
| 3469 | to C<1> then I<all> defined fields will be quoted. (C<undef> fields are not
|
---|
| 3470 | quoted, see L</blank_is_undef>). This makes it quite often easier to handle
|
---|
| 3471 | exported data in external applications.
|
---|
| 3472 |
|
---|
| 3473 | =head3 quote_space
|
---|
| 3474 |
|
---|
| 3475 | my $csv = Text::CSV_PP->new ({ quote_space => 1 });
|
---|
| 3476 | $csv->quote_space (0);
|
---|
| 3477 | my $f = $csv->quote_space;
|
---|
| 3478 |
|
---|
| 3479 | By default, a space in a field would trigger quotation. As no rule exists
|
---|
| 3480 | for this to be forced in C<CSV>, nor any for the opposite, the default is true
|
---|
| 3481 | for safety. You can exclude the space from this trigger by setting this
|
---|
| 3482 | attribute to 0.
|
---|
| 3483 |
|
---|
| 3484 | =head3 quote_empty
|
---|
| 3485 |
|
---|
| 3486 | my $csv = Text::CSV_PP->new ({ quote_empty => 1 });
|
---|
| 3487 | $csv->quote_empty (0);
|
---|
| 3488 | my $f = $csv->quote_empty;
|
---|
| 3489 |
|
---|
| 3490 | By default the generated fields are quoted only if they I<need> to be. An
|
---|
| 3491 | empty (defined) field does not need quotation. If you set this attribute to
|
---|
| 3492 | C<1> then I<empty> defined fields will be quoted. (C<undef> fields are not
|
---|
| 3493 | quoted, see L</blank_is_undef>). See also L<C<always_quote>|/always_quote>.
|
---|
| 3494 |
|
---|
| 3495 | =head3 quote_binary
|
---|
| 3496 |
|
---|
| 3497 | my $csv = Text::CSV_PP->new ({ quote_binary => 1 });
|
---|
| 3498 | $csv->quote_binary (0);
|
---|
| 3499 | my $f = $csv->quote_binary;
|
---|
| 3500 |
|
---|
| 3501 | By default, all "unsafe" bytes inside a string cause the combined field to
|
---|
| 3502 | be quoted. By setting this attribute to C<0>, you can disable that trigger
|
---|
| 3503 | for bytes >= C<0x7F>.
|
---|
| 3504 |
|
---|
| 3505 | =head3 escape_null
|
---|
| 3506 |
|
---|
| 3507 | my $csv = Text::CSV_PP->new ({ escape_null => 1 });
|
---|
| 3508 | $csv->escape_null (0);
|
---|
| 3509 | my $f = $csv->escape_null;
|
---|
| 3510 |
|
---|
| 3511 | By default, a C<NULL> byte in a field would be escaped. This option enables
|
---|
| 3512 | you to treat the C<NULL> byte as a simple binary character in binary mode
|
---|
| 3513 | (the C<< { binary => 1 } >> is set). The default is true. You can prevent
|
---|
| 3514 | C<NULL> escapes by setting this attribute to C<0>.
|
---|
| 3515 |
|
---|
| 3516 | When the C<escape_char> attribute is set to undefined, this attribute will
|
---|
| 3517 | be set to false.
|
---|
| 3518 |
|
---|
| 3519 | The default setting will encode "=\x00=" as
|
---|
| 3520 |
|
---|
| 3521 | "="0="
|
---|
| 3522 |
|
---|
| 3523 | With C<escape_null> set to C<0>, this will result in
|
---|
| 3524 |
|
---|
| 3525 | "=\x00="
|
---|
| 3526 |
|
---|
| 3527 | The default when using the C<csv> function is C<false>.
|
---|
| 3528 |
|
---|
| 3529 | For backward compatibility reasons, the deprecated old name C<quote_null>
|
---|
| 3530 | is still recognized.
|
---|
| 3531 |
|
---|
| 3532 | =head3 keep_meta_info
|
---|
| 3533 |
|
---|
| 3534 | my $csv = Text::CSV_PP->new ({ keep_meta_info => 1 });
|
---|
| 3535 | $csv->keep_meta_info (0);
|
---|
| 3536 | my $f = $csv->keep_meta_info;
|
---|
| 3537 |
|
---|
| 3538 | By default, the parsing of input records is as simple and fast as possible.
|
---|
| 3539 | However, some parsing information - like quotation of the original field -
|
---|
| 3540 | is lost in that process. Setting this flag to true enables retrieving that
|
---|
| 3541 | information after parsing with the methods L</meta_info>, L</is_quoted>,
|
---|
| 3542 | and L</is_binary> described below. Default is false for performance.
|
---|
| 3543 |
|
---|
| 3544 | If you set this attribute to a value greater than 9, then you can control
|
---|
| 3545 | output quotation style like it was used in the input of the last parsed
|
---|
| 3546 | record (unless quotation was added because of other reasons).
|
---|
| 3547 |
|
---|
| 3548 | my $csv = Text::CSV_PP->new ({
|
---|
| 3549 | binary => 1,
|
---|
| 3550 | keep_meta_info => 1,
|
---|
| 3551 | quote_space => 0,
|
---|
| 3552 | });
|
---|
| 3553 |
|
---|
| 3554 | $csv->parse (q{1,,"", ," ",f,"g","h""h",help,"help"}); my @row = $csv->fields;
|
---|
| 3555 |
|
---|
| 3556 | $csv->print (*STDOUT, \@row);
|
---|
| 3557 | # 1,,, , ,f,g,"h""h",help,help
|
---|
| 3558 | $csv->keep_meta_info (11);
|
---|
| 3559 | $csv->print (*STDOUT, \@row);
|
---|
| 3560 | # 1,,"", ," ",f,"g","h""h",help,"help"
|
---|
| 3561 |
|
---|
| 3562 | =head3 undef_str
|
---|
| 3563 |
|
---|
| 3564 | my $csv = Text::CSV_PP->new ({ undef_str => "\\N" });
|
---|
| 3565 | $csv->undef_str (undef);
|
---|
| 3566 | my $s = $csv->undef_str;
|
---|
| 3567 |
|
---|
| 3568 | This attribute optionally defines the output of undefined fields. The value
|
---|
| 3569 | passed is not changed at all, so if it needs quotation, the quotation needs
|
---|
| 3570 | to be included in the value of the attribute. Use with caution, as passing
|
---|
| 3571 | a value like C<",",,,,"""> will for sure mess up your output. The default
|
---|
| 3572 | for this attribute is C<undef>, meaning no special treatment.
|
---|
| 3573 |
|
---|
| 3574 | This attribute is useful when exporting CSV data to be imported in custom
|
---|
| 3575 | loaders, like for MySQL, that recognize special sequences for C<NULL> data.
|
---|
| 3576 |
|
---|
| 3577 | This attribute has no meaning when parsing CSV data.
|
---|
| 3578 |
|
---|
| 3579 | =head3 verbatim
|
---|
| 3580 |
|
---|
| 3581 | my $csv = Text::CSV_PP->new ({ verbatim => 1 });
|
---|
| 3582 | $csv->verbatim (0);
|
---|
| 3583 | my $f = $csv->verbatim;
|
---|
| 3584 |
|
---|
| 3585 | This is a quite controversial attribute to set, but makes some hard things
|
---|
| 3586 | possible.
|
---|
| 3587 |
|
---|
| 3588 | The rationale behind this attribute is to tell the parser that the normally
|
---|
| 3589 | special characters newline (C<NL>) and Carriage Return (C<CR>) will not be
|
---|
| 3590 | special when this flag is set, and be dealt with as being ordinary binary
|
---|
| 3591 | characters. This will ease working with data with embedded newlines.
|
---|
| 3592 |
|
---|
| 3593 | When C<verbatim> is used with L</getline>, L</getline> auto-C<chomp>'s
|
---|
| 3594 | every line.
|
---|
| 3595 |
|
---|
| 3596 | Imagine a file format like
|
---|
| 3597 |
|
---|
| 3598 | M^^Hans^Janssen^Klas 2\n2A^Ja^11-06-2007#\r\n
|
---|
| 3599 |
|
---|
| 3600 | where, the line ending is a very specific C<"#\r\n">, and the sep_char is a
|
---|
| 3601 | C<^> (caret). None of the fields is quoted, but embedded binary data is
|
---|
| 3602 | likely to be present. With the specific line ending, this should not be too
|
---|
| 3603 | hard to detect.
|
---|
| 3604 |
|
---|
| 3605 | By default, Text::CSV_PP's parse function is instructed to only know about
|
---|
| 3606 | C<"\n"> and C<"\r"> to be legal line endings, and so has to deal with the
|
---|
| 3607 | embedded newline as a real C<end-of-line>, so it can scan the next line if
|
---|
| 3608 | binary is true, and the newline is inside a quoted field. With this option,
|
---|
| 3609 | we tell L</parse> to parse the line as if C<"\n"> is just nothing more than
|
---|
| 3610 | a binary character.
|
---|
| 3611 |
|
---|
| 3612 | For L</parse> this means that the parser has no more idea about line ending
|
---|
| 3613 | and L</getline> C<chomp>s line endings on reading.
|
---|
| 3614 |
|
---|
| 3615 | =head3 types
|
---|
| 3616 |
|
---|
| 3617 | A set of column types; the attribute is immediately passed to the L</types>
|
---|
| 3618 | method.
|
---|
| 3619 |
|
---|
| 3620 | =head3 callbacks
|
---|
| 3621 |
|
---|
| 3622 | See the L</Callbacks> section below.
|
---|
| 3623 |
|
---|
| 3624 | =head3 accessors
|
---|
| 3625 |
|
---|
| 3626 | To sum it up,
|
---|
| 3627 |
|
---|
| 3628 | $csv = Text::CSV_PP->new ();
|
---|
| 3629 |
|
---|
| 3630 | is equivalent to
|
---|
| 3631 |
|
---|
| 3632 | $csv = Text::CSV_PP->new ({
|
---|
| 3633 | eol => undef, # \r, \n, or \r\n
|
---|
| 3634 | sep_char => ',',
|
---|
| 3635 | sep => undef,
|
---|
| 3636 | quote_char => '"',
|
---|
| 3637 | quote => undef,
|
---|
| 3638 | escape_char => '"',
|
---|
| 3639 | binary => 0,
|
---|
| 3640 | decode_utf8 => 1,
|
---|
| 3641 | auto_diag => 0,
|
---|
| 3642 | diag_verbose => 0,
|
---|
| 3643 | blank_is_undef => 0,
|
---|
| 3644 | empty_is_undef => 0,
|
---|
| 3645 | allow_whitespace => 0,
|
---|
| 3646 | allow_loose_quotes => 0,
|
---|
| 3647 | allow_loose_escapes => 0,
|
---|
| 3648 | allow_unquoted_escape => 0,
|
---|
| 3649 | always_quote => 0,
|
---|
| 3650 | quote_empty => 0,
|
---|
| 3651 | quote_space => 1,
|
---|
| 3652 | escape_null => 1,
|
---|
| 3653 | quote_binary => 1,
|
---|
| 3654 | keep_meta_info => 0,
|
---|
| 3655 | strict => 0,
|
---|
| 3656 | formula => 0,
|
---|
| 3657 | verbatim => 0,
|
---|
| 3658 | undef_str => undef,
|
---|
| 3659 | types => undef,
|
---|
| 3660 | callbacks => undef,
|
---|
| 3661 | });
|
---|
| 3662 |
|
---|
| 3663 | For all of the above mentioned flags, an accessor method is available where
|
---|
| 3664 | you can inquire the current value, or change the value
|
---|
| 3665 |
|
---|
| 3666 | my $quote = $csv->quote_char;
|
---|
| 3667 | $csv->binary (1);
|
---|
| 3668 |
|
---|
| 3669 | It is not wise to change these settings halfway through writing C<CSV> data
|
---|
| 3670 | to a stream. If however you want to create a new stream using the available
|
---|
| 3671 | C<CSV> object, there is no harm in changing them.
|
---|
| 3672 |
|
---|
| 3673 | If the L</new> constructor call fails, it returns C<undef>, and makes the
|
---|
| 3674 | fail reason available through the L</error_diag> method.
|
---|
| 3675 |
|
---|
| 3676 | $csv = Text::CSV_PP->new ({ ecs_char => 1 }) or
|
---|
| 3677 | die "".Text::CSV_PP->error_diag ();
|
---|
| 3678 |
|
---|
| 3679 | L</error_diag> will return a string like
|
---|
| 3680 |
|
---|
| 3681 | "INI - Unknown attribute 'ecs_char'"
|
---|
| 3682 |
|
---|
| 3683 | =head2 known_attributes
|
---|
| 3684 |
|
---|
| 3685 | @attr = Text::CSV_PP->known_attributes;
|
---|
| 3686 | @attr = Text::CSV_PP::known_attributes;
|
---|
| 3687 | @attr = $csv->known_attributes;
|
---|
| 3688 |
|
---|
| 3689 | This method will return an ordered list of all the supported attributes as
|
---|
| 3690 | described above. This can be useful for knowing what attributes are valid
|
---|
| 3691 | in classes that use or extend Text::CSV_PP.
|
---|
| 3692 |
|
---|
| 3693 | =head2 print
|
---|
| 3694 |
|
---|
| 3695 | $status = $csv->print ($fh, $colref);
|
---|
| 3696 |
|
---|
| 3697 | Similar to L</combine> + L</string> + L</print>, but much more efficient.
|
---|
| 3698 | It expects an array ref as input (not an array!) and the resulting string
|
---|
| 3699 | is not really created, but immediately written to the C<$fh> object,
|
---|
| 3700 | typically an IO handle or any other object that offers a L</print> method.
|
---|
| 3701 |
|
---|
| 3702 | For performance reasons C<print> does not create a result string, so all
|
---|
| 3703 | L</string>, L</status>, L</fields>, and L</error_input> methods will return
|
---|
| 3704 | undefined information after executing this method.
|
---|
| 3705 |
|
---|
| 3706 | If C<$colref> is C<undef> (explicit, not through a variable argument) and
|
---|
| 3707 | L</bind_columns> was used to specify fields to be printed, it is possible
|
---|
| 3708 | to make performance improvements, as otherwise data would have to be copied
|
---|
| 3709 | as arguments to the method call:
|
---|
| 3710 |
|
---|
| 3711 | $csv->bind_columns (\($foo, $bar));
|
---|
| 3712 | $status = $csv->print ($fh, undef);
|
---|
| 3713 |
|
---|
| 3714 | A short benchmark
|
---|
| 3715 |
|
---|
| 3716 | my @data = ("aa" .. "zz");
|
---|
| 3717 | $csv->bind_columns (\(@data));
|
---|
| 3718 |
|
---|
| 3719 | $csv->print ($fh, [ @data ]); # 11800 recs/sec
|
---|
| 3720 | $csv->print ($fh, \@data ); # 57600 recs/sec
|
---|
| 3721 | $csv->print ($fh, undef ); # 48500 recs/sec
|
---|
| 3722 |
|
---|
| 3723 | =head2 say
|
---|
| 3724 |
|
---|
| 3725 | $status = $csv->say ($fh, $colref);
|
---|
| 3726 |
|
---|
| 3727 | Like L<C<print>|/print>, but L<C<eol>|/eol> defaults to C<$\>.
|
---|
| 3728 |
|
---|
| 3729 | =head2 print_hr
|
---|
| 3730 |
|
---|
| 3731 | $csv->print_hr ($fh, $ref);
|
---|
| 3732 |
|
---|
| 3733 | Provides an easy way to print a C<$ref> (as fetched with L</getline_hr>)
|
---|
| 3734 | provided the column names are set with L</column_names>.
|
---|
| 3735 |
|
---|
| 3736 | It is just a wrapper method with basic parameter checks over
|
---|
| 3737 |
|
---|
| 3738 | $csv->print ($fh, [ map { $ref->{$_} } $csv->column_names ]);
|
---|
| 3739 |
|
---|
| 3740 | =head2 combine
|
---|
| 3741 |
|
---|
| 3742 | $status = $csv->combine (@fields);
|
---|
| 3743 |
|
---|
| 3744 | This method constructs a C<CSV> record from C<@fields>, returning success
|
---|
| 3745 | or failure. Failure can result from lack of arguments or an argument that
|
---|
| 3746 | contains an invalid character. Upon success, L</string> can be called to
|
---|
| 3747 | retrieve the resultant C<CSV> string. Upon failure, the value returned by
|
---|
| 3748 | L</string> is undefined and L</error_input> could be called to retrieve the
|
---|
| 3749 | invalid argument.
|
---|
| 3750 |
|
---|
| 3751 | =head2 string
|
---|
| 3752 |
|
---|
| 3753 | $line = $csv->string ();
|
---|
| 3754 |
|
---|
| 3755 | This method returns the input to L</parse> or the resultant C<CSV> string
|
---|
| 3756 | of L</combine>, whichever was called more recently.
|
---|
| 3757 |
|
---|
| 3758 | =head2 getline
|
---|
| 3759 |
|
---|
| 3760 | $colref = $csv->getline ($fh);
|
---|
| 3761 |
|
---|
| 3762 | This is the counterpart to L</print>, as L</parse> is the counterpart to
|
---|
| 3763 | L</combine>: it parses a row from the C<$fh> handle using the L</getline>
|
---|
| 3764 | method associated with C<$fh> and parses this row into an array ref. This
|
---|
| 3765 | array ref is returned by the function or C<undef> for failure. When C<$fh>
|
---|
| 3766 | does not support C<getline>, you are likely to hit errors.
|
---|
| 3767 |
|
---|
| 3768 | When fields are bound with L</bind_columns> the return value is a reference
|
---|
| 3769 | to an empty list.
|
---|
| 3770 |
|
---|
| 3771 | The L</string>, L</fields>, and L</status> methods are meaningless again.
|
---|
| 3772 |
|
---|
| 3773 | =head2 getline_all
|
---|
| 3774 |
|
---|
| 3775 | $arrayref = $csv->getline_all ($fh);
|
---|
| 3776 | $arrayref = $csv->getline_all ($fh, $offset);
|
---|
| 3777 | $arrayref = $csv->getline_all ($fh, $offset, $length);
|
---|
| 3778 |
|
---|
| 3779 | This will return a reference to a list of L<getline ($fh)|/getline> results.
|
---|
| 3780 | In this call, C<keep_meta_info> is disabled. If C<$offset> is negative, as
|
---|
| 3781 | with C<splice>, only the last C<abs ($offset)> records of C<$fh> are taken
|
---|
| 3782 | into consideration.
|
---|
| 3783 |
|
---|
| 3784 | Given a CSV file with 10 lines:
|
---|
| 3785 |
|
---|
| 3786 | lines call
|
---|
| 3787 | ----- ---------------------------------------------------------
|
---|
| 3788 | 0..9 $csv->getline_all ($fh) # all
|
---|
| 3789 | 0..9 $csv->getline_all ($fh, 0) # all
|
---|
| 3790 | 8..9 $csv->getline_all ($fh, 8) # start at 8
|
---|
| 3791 | - $csv->getline_all ($fh, 0, 0) # start at 0 first 0 rows
|
---|
| 3792 | 0..4 $csv->getline_all ($fh, 0, 5) # start at 0 first 5 rows
|
---|
| 3793 | 4..5 $csv->getline_all ($fh, 4, 2) # start at 4 first 2 rows
|
---|
| 3794 | 8..9 $csv->getline_all ($fh, -2) # last 2 rows
|
---|
| 3795 | 6..7 $csv->getline_all ($fh, -4, 2) # first 2 of last 4 rows
|
---|
| 3796 |
|
---|
| 3797 | =head2 getline_hr
|
---|
| 3798 |
|
---|
| 3799 | The L</getline_hr> and L</column_names> methods work together to allow you
|
---|
| 3800 | to have rows returned as hashrefs. You must call L</column_names> first to
|
---|
| 3801 | declare your column names.
|
---|
| 3802 |
|
---|
| 3803 | $csv->column_names (qw( code name price description ));
|
---|
| 3804 | $hr = $csv->getline_hr ($fh);
|
---|
| 3805 | print "Price for $hr->{name} is $hr->{price} EUR\n";
|
---|
| 3806 |
|
---|
| 3807 | L</getline_hr> will croak if called before L</column_names>.
|
---|
| 3808 |
|
---|
| 3809 | Note that L</getline_hr> creates a hashref for every row and will be much
|
---|
| 3810 | slower than the combined use of L</bind_columns> and L</getline> but still
|
---|
| 3811 | offering the same easy-to-use hashref inside the loop:
|
---|
| 3812 |
|
---|
| 3813 | my @cols = @{$csv->getline ($fh)};
|
---|
| 3814 | $csv->column_names (@cols);
|
---|
| 3815 | while (my $row = $csv->getline_hr ($fh)) {
|
---|
| 3816 | print $row->{price};
|
---|
| 3817 | }
|
---|
| 3818 |
|
---|
| 3819 | Could easily be rewritten to the much faster:
|
---|
| 3820 |
|
---|
| 3821 | my @cols = @{$csv->getline ($fh)};
|
---|
| 3822 | my $row = {};
|
---|
| 3823 | $csv->bind_columns (\@{$row}{@cols});
|
---|
| 3824 | while ($csv->getline ($fh)) {
|
---|
| 3825 | print $row->{price};
|
---|
| 3826 | }
|
---|
| 3827 |
|
---|
| 3828 | Your mileage may vary for the size of the data and the number of rows. With
|
---|
| 3829 | perl-5.14.2 the comparison for a 100_000 line file with 14 columns:
|
---|
| 3830 |
|
---|
| 3831 | Rate hashrefs getlines
|
---|
| 3832 | hashrefs 1.00/s -- -76%
|
---|
| 3833 | getlines 4.15/s 313% --
|
---|
| 3834 |
|
---|
| 3835 | =head2 getline_hr_all
|
---|
| 3836 |
|
---|
| 3837 | $arrayref = $csv->getline_hr_all ($fh);
|
---|
| 3838 | $arrayref = $csv->getline_hr_all ($fh, $offset);
|
---|
| 3839 | $arrayref = $csv->getline_hr_all ($fh, $offset, $length);
|
---|
| 3840 |
|
---|
| 3841 | This will return a reference to a list of L<getline_hr ($fh)|/getline_hr>
|
---|
| 3842 | results. In this call, L<C<keep_meta_info>|/keep_meta_info> is disabled.
|
---|
| 3843 |
|
---|
| 3844 | =head2 parse
|
---|
| 3845 |
|
---|
| 3846 | $status = $csv->parse ($line);
|
---|
| 3847 |
|
---|
| 3848 | This method decomposes a C<CSV> string into fields, returning success or
|
---|
| 3849 | failure. Failure can result from a lack of argument or the given C<CSV>
|
---|
| 3850 | string is improperly formatted. Upon success, L</fields> can be called to
|
---|
| 3851 | retrieve the decomposed fields. Upon failure calling L</fields> will return
|
---|
| 3852 | undefined data and L</error_input> can be called to retrieve the invalid
|
---|
| 3853 | argument.
|
---|
| 3854 |
|
---|
| 3855 | You may use the L</types> method for setting column types. See L</types>'
|
---|
| 3856 | description below.
|
---|
| 3857 |
|
---|
| 3858 | The C<$line> argument is supposed to be a simple scalar. Everything else is
|
---|
| 3859 | supposed to croak and set error 1500.
|
---|
| 3860 |
|
---|
| 3861 | =head2 fragment
|
---|
| 3862 |
|
---|
| 3863 | This function tries to implement RFC7111 (URI Fragment Identifiers for the
|
---|
| 3864 | text/csv Media Type) - http://tools.ietf.org/html/rfc7111
|
---|
| 3865 |
|
---|
| 3866 | my $AoA = $csv->fragment ($fh, $spec);
|
---|
| 3867 |
|
---|
| 3868 | In specifications, C<*> is used to specify the I<last> item, a dash (C<->)
|
---|
| 3869 | to indicate a range. All indices are C<1>-based: the first row or column
|
---|
| 3870 | has index C<1>. Selections can be combined with the semi-colon (C<;>).
|
---|
| 3871 |
|
---|
| 3872 | When using this method in combination with L</column_names>, the returned
|
---|
| 3873 | reference will point to a list of hashes instead of a list of lists. A
|
---|
| 3874 | disjointed cell-based combined selection might return rows with different
|
---|
| 3875 | number of columns making the use of hashes unpredictable.
|
---|
| 3876 |
|
---|
| 3877 | $csv->column_names ("Name", "Age");
|
---|
| 3878 | my $AoH = $csv->fragment ($fh, "col=3;8");
|
---|
| 3879 |
|
---|
| 3880 | If the L</after_parse> callback is active, it is also called on every line
|
---|
| 3881 | parsed and skipped before the fragment.
|
---|
| 3882 |
|
---|
| 3883 | =over 2
|
---|
| 3884 |
|
---|
| 3885 | =item row
|
---|
| 3886 |
|
---|
| 3887 | row=4
|
---|
| 3888 | row=5-7
|
---|
| 3889 | row=6-*
|
---|
| 3890 | row=1-2;4;6-*
|
---|
| 3891 |
|
---|
| 3892 | =item col
|
---|
| 3893 |
|
---|
| 3894 | col=2
|
---|
| 3895 | col=1-3
|
---|
| 3896 | col=4-*
|
---|
| 3897 | col=1-2;4;7-*
|
---|
| 3898 |
|
---|
| 3899 | =item cell
|
---|
| 3900 |
|
---|
| 3901 | In cell-based selection, the comma (C<,>) is used to pair row and column
|
---|
| 3902 |
|
---|
| 3903 | cell=4,1
|
---|
| 3904 |
|
---|
| 3905 | The range operator (C<->) using C<cell>s can be used to define top-left and
|
---|
| 3906 | bottom-right C<cell> location
|
---|
| 3907 |
|
---|
| 3908 | cell=3,1-4,6
|
---|
| 3909 |
|
---|
| 3910 | The C<*> is only allowed in the second part of a pair
|
---|
| 3911 |
|
---|
| 3912 | cell=3,2-*,2 # row 3 till end, only column 2
|
---|
| 3913 | cell=3,2-3,* # column 2 till end, only row 3
|
---|
| 3914 | cell=3,2-*,* # strip row 1 and 2, and column 1
|
---|
| 3915 |
|
---|
| 3916 | Cells and cell ranges may be combined with C<;>, possibly resulting in rows
|
---|
| 3917 | with different number of columns
|
---|
| 3918 |
|
---|
| 3919 | cell=1,1-2,2;3,3-4,4;1,4;4,1
|
---|
| 3920 |
|
---|
| 3921 | Disjointed selections will only return selected cells. The cells that are
|
---|
| 3922 | not specified will not be included in the returned set, not even as
|
---|
| 3923 | C<undef>. As an example given a C<CSV> like
|
---|
| 3924 |
|
---|
| 3925 | 11,12,13,...19
|
---|
| 3926 | 21,22,...28,29
|
---|
| 3927 | : :
|
---|
| 3928 | 91,...97,98,99
|
---|
| 3929 |
|
---|
| 3930 | with C<cell=1,1-2,2;3,3-4,4;1,4;4,1> will return:
|
---|
| 3931 |
|
---|
| 3932 | 11,12,14
|
---|
| 3933 | 21,22
|
---|
| 3934 | 33,34
|
---|
| 3935 | 41,43,44
|
---|
| 3936 |
|
---|
| 3937 | Overlapping cell-specs will return those cells only once, so
|
---|
| 3938 | C<cell=1,1-3,3;2,2-4,4;2,3;4,2> will return:
|
---|
| 3939 |
|
---|
| 3940 | 11,12,13
|
---|
| 3941 | 21,22,23,24
|
---|
| 3942 | 31,32,33,34
|
---|
| 3943 | 42,43,44
|
---|
| 3944 |
|
---|
| 3945 | =back
|
---|
| 3946 |
|
---|
| 3947 | L<RFC7111|http://tools.ietf.org/html/rfc7111> does B<not> allow different
|
---|
| 3948 | types of specs to be combined (either C<row> I<or> C<col> I<or> C<cell>).
|
---|
| 3949 | Passing an invalid fragment specification will croak and set error 2013.
|
---|
| 3950 |
|
---|
| 3951 | =head2 column_names
|
---|
| 3952 |
|
---|
| 3953 | Set the "keys" that will be used in the L</getline_hr> calls. If no keys
|
---|
| 3954 | (column names) are passed, it will return the current setting as a list.
|
---|
| 3955 |
|
---|
| 3956 | L</column_names> accepts a list of scalars (the column names) or a single
|
---|
| 3957 | array_ref, so you can pass the return value from L</getline> too:
|
---|
| 3958 |
|
---|
| 3959 | $csv->column_names ($csv->getline ($fh));
|
---|
| 3960 |
|
---|
| 3961 | L</column_names> does B<no> checking on duplicates at all, which might lead
|
---|
| 3962 | to unexpected results. Undefined entries will be replaced with the string
|
---|
| 3963 | C<"\cAUNDEF\cA">, so
|
---|
| 3964 |
|
---|
| 3965 | $csv->column_names (undef, "", "name", "name");
|
---|
| 3966 | $hr = $csv->getline_hr ($fh);
|
---|
| 3967 |
|
---|
| 3968 | Will set C<< $hr->{"\cAUNDEF\cA"} >> to the 1st field, C<< $hr->{""} >> to
|
---|
| 3969 | the 2nd field, and C<< $hr->{name} >> to the 4th field, discarding the 3rd
|
---|
| 3970 | field.
|
---|
| 3971 |
|
---|
| 3972 | L</column_names> croaks on invalid arguments.
|
---|
| 3973 |
|
---|
| 3974 | =head2 header
|
---|
| 3975 |
|
---|
| 3976 | This method does NOT work in perl-5.6.x
|
---|
| 3977 |
|
---|
| 3978 | Parse the CSV header and set L<C<sep>|/sep>, column_names and encoding.
|
---|
| 3979 |
|
---|
| 3980 | my @hdr = $csv->header ($fh);
|
---|
| 3981 | $csv->header ($fh, { sep_set => [ ";", ",", "|", "\t" ] });
|
---|
| 3982 | $csv->header ($fh, { detect_bom => 1, munge_column_names => "lc" });
|
---|
| 3983 |
|
---|
| 3984 | The first argument should be a file handle.
|
---|
| 3985 |
|
---|
| 3986 | This method resets some object properties, as it is supposed to be invoked
|
---|
| 3987 | only once per file or stream. It will leave attributes C<column_names> and
|
---|
| 3988 | C<bound_columns> alone if setting column names is disabled. Reading headers
|
---|
| 3989 | on previously processed objects might fail on perl-5.8.0 and older.
|
---|
| 3990 |
|
---|
| 3991 | Assuming that the file opened for parsing has a header, and the header does
|
---|
| 3992 | not contain problematic characters like embedded newlines, read the first
|
---|
| 3993 | line from the open handle then auto-detect whether the header separates the
|
---|
| 3994 | column names with a character from the allowed separator list.
|
---|
| 3995 |
|
---|
| 3996 | If any of the allowed separators matches, and none of the I<other> allowed
|
---|
| 3997 | separators match, set L<C<sep>|/sep> to that separator for the current
|
---|
| 3998 | CSV_PP instance and use it to parse the first line, map those to lowercase,
|
---|
| 3999 | and use that to set the instance L</column_names>:
|
---|
| 4000 |
|
---|
| 4001 | my $csv = Text::CSV_PP->new ({ binary => 1, auto_diag => 1 });
|
---|
| 4002 | open my $fh, "<", "file.csv";
|
---|
| 4003 | binmode $fh; # for Windows
|
---|
| 4004 | $csv->header ($fh);
|
---|
| 4005 | while (my $row = $csv->getline_hr ($fh)) {
|
---|
| 4006 | ...
|
---|
| 4007 | }
|
---|
| 4008 |
|
---|
| 4009 | If the header is empty, contains more than one unique separator out of the
|
---|
| 4010 | allowed set, contains empty fields, or contains identical fields (after
|
---|
| 4011 | folding), it will croak with error 1010, 1011, 1012, or 1013 respectively.
|
---|
| 4012 |
|
---|
| 4013 | If the header contains embedded newlines or is not valid CSV in any other
|
---|
| 4014 | way, this method will croak and leave the parse error untouched.
|
---|
| 4015 |
|
---|
| 4016 | A successful call to C<header> will always set the L<C<sep>|/sep> of the
|
---|
| 4017 | C<$csv> object. This behavior can not be disabled.
|
---|
| 4018 |
|
---|
| 4019 | =head3 return value
|
---|
| 4020 |
|
---|
| 4021 | On error this method will croak.
|
---|
| 4022 |
|
---|
| 4023 | In list context, the headers will be returned whether they are used to set
|
---|
| 4024 | L</column_names> or not.
|
---|
| 4025 |
|
---|
| 4026 | In scalar context, the instance itself is returned. B<Note>: the values as
|
---|
| 4027 | found in the header will effectively be B<lost> if C<set_column_names> is
|
---|
| 4028 | false.
|
---|
| 4029 |
|
---|
| 4030 | =head3 Options
|
---|
| 4031 |
|
---|
| 4032 | =over 2
|
---|
| 4033 |
|
---|
| 4034 | =item sep_set
|
---|
| 4035 |
|
---|
| 4036 | $csv->header ($fh, { sep_set => [ ";", ",", "|", "\t" ] });
|
---|
| 4037 |
|
---|
| 4038 | The list of legal separators defaults to C<[ ";", "," ]> and can be changed
|
---|
| 4039 | by this option. As this is probably the most often used option, it can be
|
---|
| 4040 | passed on its own as an unnamed argument:
|
---|
| 4041 |
|
---|
| 4042 | $csv->header ($fh, [ ";", ",", "|", "\t", "::", "\x{2063}" ]);
|
---|
| 4043 |
|
---|
| 4044 | Multi-byte sequences are allowed, both multi-character and Unicode. See
|
---|
| 4045 | L<C<sep>|/sep>.
|
---|
| 4046 |
|
---|
| 4047 | =item detect_bom
|
---|
| 4048 |
|
---|
| 4049 | $csv->header ($fh, { detect_bom => 1 });
|
---|
| 4050 |
|
---|
| 4051 | The default behavior is to detect if the header line starts with a BOM. If
|
---|
| 4052 | the header has a BOM, use that to set the encoding of C<$fh>. This default
|
---|
| 4053 | behavior can be disabled by passing a false value to C<detect_bom>.
|
---|
| 4054 |
|
---|
| 4055 | Supported encodings from BOM are: UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, and
|
---|
| 4056 | UTF-32LE. BOM's also support UTF-1, UTF-EBCDIC, SCSU, BOCU-1, and GB-18030
|
---|
| 4057 | but L<Encode> does not (yet). UTF-7 is not supported.
|
---|
| 4058 |
|
---|
| 4059 | If a supported BOM was detected as start of the stream, it is stored in the
|
---|
| 4060 | object attribute C<ENCODING>.
|
---|
| 4061 |
|
---|
| 4062 | my $enc = $csv->{ENCODING};
|
---|
| 4063 |
|
---|
| 4064 | The encoding is used with C<binmode> on C<$fh>.
|
---|
| 4065 |
|
---|
| 4066 | If the handle was opened in a (correct) encoding, this method will B<not>
|
---|
| 4067 | alter the encoding, as it checks the leading B<bytes> of the first line. In
|
---|
| 4068 | case the stream starts with a decoded BOM (C<U+FEFF>), C<{ENCODING}> will be
|
---|
| 4069 | C<""> (empty) instead of the default C<undef>.
|
---|
| 4070 |
|
---|
| 4071 | =item munge_column_names
|
---|
| 4072 |
|
---|
| 4073 | This option offers the means to modify the column names into something that
|
---|
| 4074 | is most useful to the application. The default is to map all column names
|
---|
| 4075 | to lower case.
|
---|
| 4076 |
|
---|
| 4077 | $csv->header ($fh, { munge_column_names => "lc" });
|
---|
| 4078 |
|
---|
| 4079 | The following values are available:
|
---|
| 4080 |
|
---|
| 4081 | lc - lower case
|
---|
| 4082 | uc - upper case
|
---|
| 4083 | none - do not change
|
---|
| 4084 | \%hash - supply a mapping
|
---|
| 4085 | \&cb - supply a callback
|
---|
| 4086 |
|
---|
| 4087 | Literal:
|
---|
| 4088 |
|
---|
| 4089 | $csv->header ($fh, { munge_column_names => "none" });
|
---|
| 4090 |
|
---|
| 4091 | Hash:
|
---|
| 4092 |
|
---|
| 4093 | $csv->header ($fh, { munge_column_names => { foo => "sombrero" }});
|
---|
| 4094 |
|
---|
| 4095 | If a value does not exist, the original value is used unchanged.
|
---|
| 4096 |
|
---|
| 4097 | Callback:
|
---|
| 4098 |
|
---|
| 4099 | $csv->header ($fh, { munge_column_names => sub { fc } });
|
---|
| 4100 | $csv->header ($fh, { munge_column_names => sub { "column_".$col++ } });
|
---|
| 4101 | $csv->header ($fh, { munge_column_names => sub { lc (s/\W+/_/gr) } });
|
---|
| 4102 |
|
---|
| 4103 | As this callback is called in a C<map>, you can use C<$_> directly.
|
---|
| 4104 |
|
---|
| 4105 | =item set_column_names
|
---|
| 4106 |
|
---|
| 4107 | $csv->header ($fh, { set_column_names => 1 });
|
---|
| 4108 |
|
---|
| 4109 | The default is to set the instance's column names using L</column_names> if
|
---|
| 4110 | the method is successful, so subsequent calls to L</getline_hr> can return
|
---|
| 4111 | a hash. Disabling setting the header can be forced by using a false value for
|
---|
| 4112 | this option.
|
---|
| 4113 |
|
---|
| 4114 | As described in L</return value> above, content is lost in scalar context.
|
---|
| 4115 |
|
---|
| 4116 | =back
|
---|
| 4117 |
|
---|
| 4118 | =head3 Validation
|
---|
| 4119 |
|
---|
| 4120 | When receiving CSV files from external sources, this method can be used to
|
---|
| 4121 | protect against changes in the layout by restricting to known headers (and
|
---|
| 4122 | typos in the header fields).
|
---|
| 4123 |
|
---|
| 4124 | my %known = (
|
---|
| 4125 | "record key" => "c_rec",
|
---|
| 4126 | "rec id" => "c_rec",
|
---|
| 4127 | "id_rec" => "c_rec",
|
---|
| 4128 | "kode" => "code",
|
---|
| 4129 | "code" => "code",
|
---|
| 4130 | "vaule" => "value",
|
---|
| 4131 | "value" => "value",
|
---|
| 4132 | );
|
---|
| 4133 | my $csv = Text::CSV_PP->new ({ binary => 1, auto_diag => 1 });
|
---|
| 4134 | open my $fh, "<", $source or die "$source: $!";
|
---|
| 4135 | $csv->header ($fh, { munge_column_names => sub {
|
---|
| 4136 | s/\s+$//;
|
---|
| 4137 | s/^\s+//;
|
---|
| 4138 | $known{lc $_} or die "Unknown column '$_' in $source";
|
---|
| 4139 | }});
|
---|
| 4140 | while (my $row = $csv->getline_hr ($fh)) {
|
---|
| 4141 | say join "\t", $row->{c_rec}, $row->{code}, $row->{value};
|
---|
| 4142 | }
|
---|
| 4143 |
|
---|
| 4144 | =head2 bind_columns
|
---|
| 4145 |
|
---|
| 4146 | Takes a list of scalar references to be used for output with L</print> or
|
---|
| 4147 | to store in the fields fetched by L</getline>. When you do not pass enough
|
---|
| 4148 | references to store the fetched fields in, L</getline> will fail with error
|
---|
| 4149 | C<3006>. If you pass more than there are fields to return, the content of
|
---|
| 4150 | the remaining references is left untouched.
|
---|
| 4151 |
|
---|
| 4152 | $csv->bind_columns (\$code, \$name, \$price, \$description);
|
---|
| 4153 | while ($csv->getline ($fh)) {
|
---|
| 4154 | print "The price of a $name is \x{20ac} $price\n";
|
---|
| 4155 | }
|
---|
| 4156 |
|
---|
| 4157 | To reset or clear all column binding, call L</bind_columns> with the single
|
---|
| 4158 | argument C<undef>. This will also clear column names.
|
---|
| 4159 |
|
---|
| 4160 | $csv->bind_columns (undef);
|
---|
| 4161 |
|
---|
| 4162 | If no arguments are passed at all, L</bind_columns> will return the list of
|
---|
| 4163 | current bindings or C<undef> if no binds are active.
|
---|
| 4164 |
|
---|
| 4165 | Note that in parsing with C<bind_columns>, the fields are set on the fly.
|
---|
| 4166 | That implies that if the third field of a row causes an error (or this row
|
---|
| 4167 | has just two fields where the previous row had more), the first two fields
|
---|
| 4168 | already have been assigned the values of the current row, while the rest of
|
---|
| 4169 | the fields will still hold the values of the previous row. If you want the
|
---|
| 4170 | parser to fail in these cases, use the L<C<strict>|/strict> attribute.
|
---|
| 4171 |
|
---|
| 4172 | =head2 eof
|
---|
| 4173 |
|
---|
| 4174 | $eof = $csv->eof ();
|
---|
| 4175 |
|
---|
| 4176 | If L</parse> or L</getline> was used with an IO stream, this method will
|
---|
| 4177 | return true (1) if the last call hit end of file, otherwise it will return
|
---|
| 4178 | false (''). This is useful to see the difference between a failure and end
|
---|
| 4179 | of file.
|
---|
| 4180 |
|
---|
| 4181 | Note that if the parsing of the last line caused an error, C<eof> is still
|
---|
| 4182 | true. That means that if you are I<not> using L</auto_diag>, an idiom like
|
---|
| 4183 |
|
---|
| 4184 | while (my $row = $csv->getline ($fh)) {
|
---|
| 4185 | # ...
|
---|
| 4186 | }
|
---|
| 4187 | $csv->eof or $csv->error_diag;
|
---|
| 4188 |
|
---|
| 4189 | will I<not> report the error. You would have to change that to
|
---|
| 4190 |
|
---|
| 4191 | while (my $row = $csv->getline ($fh)) {
|
---|
| 4192 | # ...
|
---|
| 4193 | }
|
---|
| 4194 | +$csv->error_diag and $csv->error_diag;
|
---|
| 4195 |
|
---|
| 4196 | =head2 types
|
---|
| 4197 |
|
---|
| 4198 | $csv->types (\@tref);
|
---|
| 4199 |
|
---|
| 4200 | This method is used to force that (all) columns are of a given type. For
|
---|
| 4201 | example, if you have an integer column, two columns with doubles and a
|
---|
| 4202 | string column, then you might do a
|
---|
| 4203 |
|
---|
| 4204 | $csv->types ([Text::CSV_PP::IV (),
|
---|
| 4205 | Text::CSV_PP::NV (),
|
---|
| 4206 | Text::CSV_PP::NV (),
|
---|
| 4207 | Text::CSV_PP::PV ()]);
|
---|
| 4208 |
|
---|
| 4209 | Column types are used only for I<decoding> columns while parsing, in other
|
---|
| 4210 | words by the L</parse> and L</getline> methods.
|
---|
| 4211 |
|
---|
| 4212 | You can unset column types by doing a
|
---|
| 4213 |
|
---|
| 4214 | $csv->types (undef);
|
---|
| 4215 |
|
---|
| 4216 | or fetch the current type settings with
|
---|
| 4217 |
|
---|
| 4218 | $types = $csv->types ();
|
---|
| 4219 |
|
---|
| 4220 | =over 4
|
---|
| 4221 |
|
---|
| 4222 | =item IV
|
---|
| 4223 |
|
---|
| 4224 | Set field type to integer.
|
---|
| 4225 |
|
---|
| 4226 | =item NV
|
---|
| 4227 |
|
---|
| 4228 | Set field type to numeric/float.
|
---|
| 4229 |
|
---|
| 4230 | =item PV
|
---|
| 4231 |
|
---|
| 4232 | Set field type to string.
|
---|
| 4233 |
|
---|
| 4234 | =back
|
---|
| 4235 |
|
---|
| 4236 | =head2 fields
|
---|
| 4237 |
|
---|
| 4238 | @columns = $csv->fields ();
|
---|
| 4239 |
|
---|
| 4240 | This method returns the input to L</combine> or the resultant decomposed
|
---|
| 4241 | fields of a successful L</parse>, whichever was called more recently.
|
---|
| 4242 |
|
---|
| 4243 | Note that the return value is undefined after using L</getline>, which does
|
---|
| 4244 | not fill the data structures returned by L</parse>.
|
---|
| 4245 |
|
---|
| 4246 | =head2 meta_info
|
---|
| 4247 |
|
---|
| 4248 | @flags = $csv->meta_info ();
|
---|
| 4249 |
|
---|
| 4250 | This method returns the "flags" of the input to L</combine> or the flags of
|
---|
| 4251 | the resultant decomposed fields of L</parse>, whichever was called more
|
---|
| 4252 | recently.
|
---|
| 4253 |
|
---|
| 4254 | For each field, a meta_info field will hold flags that inform something
|
---|
| 4255 | about the field returned by the L</fields> method or passed to the
|
---|
| 4256 | L</combine> method. The flags are bit-wise-C<or>'d like:
|
---|
| 4257 |
|
---|
| 4258 | =over 2
|
---|
| 4259 |
|
---|
| 4260 | =item C< >0x0001
|
---|
| 4261 |
|
---|
| 4262 | The field was quoted.
|
---|
| 4263 |
|
---|
| 4264 | =item C< >0x0002
|
---|
| 4265 |
|
---|
| 4266 | The field was binary.
|
---|
| 4267 |
|
---|
| 4268 | =back
|
---|
| 4269 |
|
---|
| 4270 | See the C<is_***> methods below.
|
---|
| 4271 |
|
---|
| 4272 | =head2 is_quoted
|
---|
| 4273 |
|
---|
| 4274 | my $quoted = $csv->is_quoted ($column_idx);
|
---|
| 4275 |
|
---|
| 4276 | Where C<$column_idx> is the (zero-based) index of the column in the last
|
---|
| 4277 | result of L</parse>.
|
---|
| 4278 |
|
---|
| 4279 | This returns a true value if the data in the indicated column was enclosed
|
---|
| 4280 | in L<C<quote_char>|/quote_char> quotes. This might be important for fields
|
---|
| 4281 | where content C<,20070108,> is to be treated as a numeric value, and where
|
---|
| 4282 | C<,"20070108",> is explicitly marked as character string data.
|
---|
| 4283 |
|
---|
| 4284 | This method is only valid when L</keep_meta_info> is set to a true value.
|
---|
| 4285 |
|
---|
| 4286 | =head2 is_binary
|
---|
| 4287 |
|
---|
| 4288 | my $binary = $csv->is_binary ($column_idx);
|
---|
| 4289 |
|
---|
| 4290 | Where C<$column_idx> is the (zero-based) index of the column in the last
|
---|
| 4291 | result of L</parse>.
|
---|
| 4292 |
|
---|
| 4293 | This returns a true value if the data in the indicated column contained any
|
---|
| 4294 | byte in the range C<[\x00-\x08,\x10-\x1F,\x7F-\xFF]>.
|
---|
| 4295 |
|
---|
| 4296 | This method is only valid when L</keep_meta_info> is set to a true value.
|
---|
| 4297 |
|
---|
| 4298 | =head2 is_missing
|
---|
| 4299 |
|
---|
| 4300 | my $missing = $csv->is_missing ($column_idx);
|
---|
| 4301 |
|
---|
| 4302 | Where C<$column_idx> is the (zero-based) index of the column in the last
|
---|
| 4303 | result of L</getline_hr>.
|
---|
| 4304 |
|
---|
| 4305 | $csv->keep_meta_info (1);
|
---|
| 4306 | while (my $hr = $csv->getline_hr ($fh)) {
|
---|
| 4307 | $csv->is_missing (0) and next; # This was an empty line
|
---|
| 4308 | }
|
---|
| 4309 |
|
---|
| 4310 | When using L</getline_hr>, it is impossible to tell if the parsed fields
|
---|
| 4311 | are C<undef> because they were not filled in the C<CSV> stream or because
|
---|
| 4312 | they were not read at all, as B<all> the fields defined by L</column_names>
|
---|
| 4313 | are set in the hash-ref. If you still need to know if all fields in each
|
---|
| 4314 | row are provided, you should enable L<C<keep_meta_info>|/keep_meta_info> so
|
---|
| 4315 | you can check the flags.
|
---|
| 4316 |
|
---|
| 4317 | If L<C<keep_meta_info>|/keep_meta_info> is C<false>, C<is_missing> will
|
---|
| 4318 | always return C<undef>, regardless of C<$column_idx> being valid or not. If
|
---|
| 4319 | this attribute is C<true> it will return either C<0> (the field is present)
|
---|
| 4320 | or C<1> (the field is missing).
|
---|
| 4321 |
|
---|
| 4322 | A special case is the empty line. If the line is completely empty - after
|
---|
| 4323 | dealing with the flags - this is still a valid CSV line: it is a record of
|
---|
| 4324 | just one single empty field. However, if C<keep_meta_info> is set, invoking
|
---|
| 4325 | C<is_missing> with index C<0> will now return true.
|
---|
| 4326 |
|
---|
| 4327 | =head2 status
|
---|
| 4328 |
|
---|
| 4329 | $status = $csv->status ();
|
---|
| 4330 |
|
---|
| 4331 | This method returns the status of the last invoked L</combine> or L</parse>
|
---|
| 4332 | call. Status is success (true: C<1>) or failure (false: C<undef> or C<0>).
|
---|
| 4333 |
|
---|
| 4334 | =head2 error_input
|
---|
| 4335 |
|
---|
| 4336 | $bad_argument = $csv->error_input ();
|
---|
| 4337 |
|
---|
| 4338 | This method returns the erroneous argument (if it exists) of L</combine> or
|
---|
| 4339 | L</parse>, whichever was called more recently. If the last invocation was
|
---|
| 4340 | successful, C<error_input> will return C<undef>.
|
---|
| 4341 |
|
---|
| 4342 | =head2 error_diag
|
---|
| 4343 |
|
---|
| 4344 | Text::CSV_PP->error_diag ();
|
---|
| 4345 | $csv->error_diag ();
|
---|
| 4346 | $error_code = 0 + $csv->error_diag ();
|
---|
| 4347 | $error_str = "" . $csv->error_diag ();
|
---|
| 4348 | ($cde, $str, $pos, $rec, $fld) = $csv->error_diag ();
|
---|
| 4349 |
|
---|
| 4350 | If (and only if) an error occurred, this function returns the diagnostics
|
---|
| 4351 | of that error.
|
---|
| 4352 |
|
---|
| 4353 | If called in void context, this will print the internal error code and the
|
---|
| 4354 | associated error message to STDERR.
|
---|
| 4355 |
|
---|
| 4356 | If called in list context, this will return the error code and the error
|
---|
| 4357 | message in that order. If the last error was from parsing, the rest of the
|
---|
| 4358 | values returned are a best guess at the location within the line that was
|
---|
| 4359 | being parsed. Their values are 1-based. The position currently is index of
|
---|
| 4360 | the byte at which the parsing failed in the current record. It might change
|
---|
| 4361 | to be the index of the current character in a later release. The record is
|
---|
| 4362 | the index of the record parsed by the csv instance. The field number is the
|
---|
| 4363 | index of the field the parser thinks it is currently trying to parse. See
|
---|
| 4364 | F<examples/csv-check> for how this can be used.
|
---|
| 4365 |
|
---|
| 4366 | If called in scalar context, it will return the diagnostics in a single
|
---|
| 4367 | scalar, a-la C<$!>. It will contain the error code in numeric context, and
|
---|
| 4368 | the diagnostics message in string context.
|
---|
| 4369 |
|
---|
| 4370 | When called as a class method or a direct function call, the diagnostics
|
---|
| 4371 | are that of the last L</new> call.
|
---|
| 4372 |
|
---|
| 4373 | =head2 record_number
|
---|
| 4374 |
|
---|
| 4375 | $recno = $csv->record_number ();
|
---|
| 4376 |
|
---|
| 4377 | Returns the records parsed by this csv instance. This value should be more
|
---|
| 4378 | accurate than C<$.> when embedded newlines come in play. Records written by
|
---|
| 4379 | this instance are not counted.
|
---|
| 4380 |
|
---|
| 4381 | =head2 SetDiag
|
---|
| 4382 |
|
---|
| 4383 | $csv->SetDiag (0);
|
---|
| 4384 |
|
---|
| 4385 | Use to reset the diagnostics if you are dealing with errors.
|
---|
| 4386 |
|
---|
| 4387 | =head1 FUNCTIONS
|
---|
| 4388 |
|
---|
| 4389 | This section is also taken from Text::CSV_XS.
|
---|
| 4390 |
|
---|
| 4391 | =head2 csv
|
---|
| 4392 |
|
---|
| 4393 | This function is not exported by default and should be explicitly requested:
|
---|
| 4394 |
|
---|
| 4395 | use Text::CSV_PP qw( csv );
|
---|
| 4396 |
|
---|
| 4397 | This is a high-level function that aims at simple (user) interfaces. This
|
---|
| 4398 | can be used to read/parse a C<CSV> file or stream (the default behavior) or
|
---|
| 4399 | to produce a file or write to a stream (define the C<out> attribute). It
|
---|
| 4400 | returns an array- or hash-reference on parsing (or C<undef> on fail) or the
|
---|
| 4401 | numeric value of L</error_diag> on writing. When this function fails you
|
---|
| 4402 | can get to the error using the class call to L</error_diag>
|
---|
| 4403 |
|
---|
| 4404 | my $aoa = csv (in => "test.csv") or
|
---|
| 4405 | die Text::CSV_PP->error_diag;
|
---|
| 4406 |
|
---|
| 4407 | This function takes the arguments as key-value pairs. This can be passed as
|
---|
| 4408 | a list or as an anonymous hash:
|
---|
| 4409 |
|
---|
| 4410 | my $aoa = csv ( in => "test.csv", sep_char => ";");
|
---|
| 4411 | my $aoh = csv ({ in => $fh, headers => "auto" });
|
---|
| 4412 |
|
---|
| 4413 | The arguments passed consist of two parts: the arguments to L</csv> itself
|
---|
| 4414 | and the optional attributes to the C<CSV> object used inside the function
|
---|
| 4415 | as enumerated and explained in L</new>.
|
---|
| 4416 |
|
---|
| 4417 | If not overridden, the default option used for CSV is
|
---|
| 4418 |
|
---|
| 4419 | auto_diag => 1
|
---|
| 4420 | escape_null => 0
|
---|
| 4421 |
|
---|
| 4422 | The option that is always set and cannot be altered is
|
---|
| 4423 |
|
---|
| 4424 | binary => 1
|
---|
| 4425 |
|
---|
| 4426 | As this function will likely be used in one-liners, it allows C<quote> to
|
---|
| 4427 | be abbreviated as C<quo>, and C<escape_char> to be abbreviated as C<esc>
|
---|
| 4428 | or C<escape>.
|
---|
| 4429 |
|
---|
| 4430 | Alternative invocations:
|
---|
| 4431 |
|
---|
| 4432 | my $aoa = Text::CSV_PP::csv (in => "file.csv");
|
---|
| 4433 |
|
---|
| 4434 | my $csv = Text::CSV_PP->new ();
|
---|
| 4435 | my $aoa = $csv->csv (in => "file.csv");
|
---|
| 4436 |
|
---|
| 4437 | In the latter case, the object attributes are used from the existing object
|
---|
| 4438 | and the attribute arguments in the function call are ignored:
|
---|
| 4439 |
|
---|
| 4440 | my $csv = Text::CSV_PP->new ({ sep_char => ";" });
|
---|
| 4441 | my $aoh = $csv->csv (in => "file.csv", bom => 1);
|
---|
| 4442 |
|
---|
| 4443 | will parse using C<;> as C<sep_char>, not C<,>.
|
---|
| 4444 |
|
---|
| 4445 | =head3 in
|
---|
| 4446 |
|
---|
| 4447 | Used to specify the source. C<in> can be a file name (e.g. C<"file.csv">),
|
---|
| 4448 | which will be opened for reading and closed when finished, a file handle
|
---|
| 4449 | (e.g. C<$fh> or C<FH>), a reference to a glob (e.g. C<\*ARGV>), the glob
|
---|
| 4450 | itself (e.g. C<*STDIN>), or a reference to a scalar (e.g. C<\q{1,2,"csv"}>).
|
---|
| 4451 |
|
---|
| 4452 | When used with L</out>, C<in> should be a reference to a CSV structure (AoA
|
---|
| 4453 | or AoH) or a CODE-ref that returns an array-reference or a hash-reference.
|
---|
| 4454 | The code-ref will be invoked with no arguments.
|
---|
| 4455 |
|
---|
| 4456 | my $aoa = csv (in => "file.csv");
|
---|
| 4457 |
|
---|
| 4458 | open my $fh, "<", "file.csv";
|
---|
| 4459 | my $aoa = csv (in => $fh);
|
---|
| 4460 |
|
---|
| 4461 | my $csv = [ [qw( Foo Bar )], [ 1, 2 ], [ 2, 3 ]];
|
---|
| 4462 | my $err = csv (in => $csv, out => "file.csv");
|
---|
| 4463 |
|
---|
| 4464 | If called in void context without the L</out> attribute, the resulting ref
|
---|
| 4465 | will be used as input to a subsequent call to csv:
|
---|
| 4466 |
|
---|
| 4467 | csv (in => "file.csv", filter => { 2 => sub { length > 2 }})
|
---|
| 4468 |
|
---|
| 4469 | will be a shortcut to
|
---|
| 4470 |
|
---|
| 4471 | csv (in => csv (in => "file.csv", filter => { 2 => sub { length > 2 }}))
|
---|
| 4472 |
|
---|
| 4473 | where, in the absence of the C<out> attribute, this is a shortcut to
|
---|
| 4474 |
|
---|
| 4475 | csv (in => csv (in => "file.csv", filter => { 2 => sub { length > 2 }}),
|
---|
| 4476 | out => *STDOUT)
|
---|
| 4477 |
|
---|
| 4478 | =head3 out
|
---|
| 4479 |
|
---|
| 4480 | csv (in => $aoa, out => "file.csv");
|
---|
| 4481 | csv (in => $aoa, out => $fh);
|
---|
| 4482 | csv (in => $aoa, out => STDOUT);
|
---|
| 4483 | csv (in => $aoa, out => *STDOUT);
|
---|
| 4484 | csv (in => $aoa, out => \*STDOUT);
|
---|
| 4485 | csv (in => $aoa, out => \my $data);
|
---|
| 4486 | csv (in => $aoa, out => undef);
|
---|
| 4487 | csv (in => $aoa, out => \"skip");
|
---|
| 4488 |
|
---|
| 4489 | In output mode, the default CSV options when producing CSV are
|
---|
| 4490 |
|
---|
| 4491 | eol => "\r\n"
|
---|
| 4492 |
|
---|
| 4493 | The L</fragment> attribute is ignored in output mode.
|
---|
| 4494 |
|
---|
| 4495 | C<out> can be a file name (e.g. C<"file.csv">), which will be opened for
|
---|
| 4496 | writing and closed when finished, a file handle (e.g. C<$fh> or C<FH>), a
|
---|
| 4497 | reference to a glob (e.g. C<\*STDOUT>), the glob itself (e.g. C<*STDOUT>),
|
---|
| 4498 | or a reference to a scalar (e.g. C<\my $data>).
|
---|
| 4499 |
|
---|
| 4500 | csv (in => sub { $sth->fetch }, out => "dump.csv");
|
---|
| 4501 | csv (in => sub { $sth->fetchrow_hashref }, out => "dump.csv",
|
---|
| 4502 | headers => $sth->{NAME_lc});
|
---|
| 4503 |
|
---|
| 4504 | When a code-ref is used for C<in>, the output is generated per invocation,
|
---|
| 4505 | so no buffering is involved. This implies that there is no size restriction
|
---|
| 4506 | on the number of records. The C<csv> function ends when the coderef returns
|
---|
| 4507 | a false value.
|
---|
| 4508 |
|
---|
| 4509 | If C<out> is set to a reference of the literal string C<"skip">, the output
|
---|
| 4510 | will be suppressed completely, which might be useful in combination with a
|
---|
| 4511 | filter for side effects only.
|
---|
| 4512 |
|
---|
| 4513 | my %cache;
|
---|
| 4514 | csv (in => "dump.csv",
|
---|
| 4515 | out => \"skip",
|
---|
| 4516 | on_in => sub { $cache{$_[1][1]}++ });
|
---|
| 4517 |
|
---|
| 4518 | Currently, setting C<out> to any false value (C<undef>, C<"">, 0) will be
|
---|
| 4519 | equivalent to C<\"skip">.
|
---|
| 4520 |
|
---|
| 4521 | =head3 encoding
|
---|
| 4522 |
|
---|
| 4523 | If passed, it should be an encoding accepted by the C<:encoding()> option
|
---|
| 4524 | to C<open>. There is no default value. This attribute does not work in perl
|
---|
| 4525 | 5.6.x. C<encoding> can be abbreviated to C<enc> for ease of use in command
|
---|
| 4526 | line invocations.
|
---|
| 4527 |
|
---|
| 4528 | If C<encoding> is set to the literal value C<"auto">, the method L</header>
|
---|
| 4529 | will be invoked on the opened stream to check if there is a BOM and set the
|
---|
| 4530 | encoding accordingly. This is equal to passing a true value in the option
|
---|
| 4531 | L<C<detect_bom>|/detect_bom>.
|
---|
| 4532 |
|
---|
| 4533 | =head3 detect_bom
|
---|
| 4534 |
|
---|
| 4535 | If C<detect_bom> is given, the method L</header> will be invoked on the
|
---|
| 4536 | opened stream to check if there is a BOM and set the encoding accordingly.
|
---|
| 4537 |
|
---|
| 4538 | C<detect_bom> can be abbreviated to C<bom>.
|
---|
| 4539 |
|
---|
| 4540 | This is the same as setting L<C<encoding>|/encoding> to C<"auto">.
|
---|
| 4541 |
|
---|
| 4542 | Note that as the method L</header> is invoked, its default is to also set
|
---|
| 4543 | the headers.
|
---|
| 4544 |
|
---|
| 4545 | =head3 headers
|
---|
| 4546 |
|
---|
| 4547 | If this attribute is not given, the default behavior is to produce an array
|
---|
| 4548 | of arrays.
|
---|
| 4549 |
|
---|
| 4550 | If C<headers> is supplied, it should be an anonymous list of column names,
|
---|
| 4551 | an anonymous hashref, a coderef, or a literal flag: C<auto>, C<lc>, C<uc>,
|
---|
| 4552 | or C<skip>.
|
---|
| 4553 |
|
---|
| 4554 | =over 2
|
---|
| 4555 |
|
---|
| 4556 | =item skip
|
---|
| 4557 |
|
---|
| 4558 | When C<skip> is used, the header will not be included in the output.
|
---|
| 4559 |
|
---|
| 4560 | my $aoa = csv (in => $fh, headers => "skip");
|
---|
| 4561 |
|
---|
| 4562 | =item auto
|
---|
| 4563 |
|
---|
| 4564 | If C<auto> is used, the first line of the C<CSV> source will be read as the
|
---|
| 4565 | list of field headers and used to produce an array of hashes.
|
---|
| 4566 |
|
---|
| 4567 | my $aoh = csv (in => $fh, headers => "auto");
|
---|
| 4568 |
|
---|
| 4569 | =item lc
|
---|
| 4570 |
|
---|
| 4571 | If C<lc> is used, the first line of the C<CSV> source will be read as the
|
---|
| 4572 | list of field headers mapped to lower case and used to produce an array of
|
---|
| 4573 | hashes. This is a variation of C<auto>.
|
---|
| 4574 |
|
---|
| 4575 | my $aoh = csv (in => $fh, headers => "lc");
|
---|
| 4576 |
|
---|
| 4577 | =item uc
|
---|
| 4578 |
|
---|
| 4579 | If C<uc> is used, the first line of the C<CSV> source will be read as the
|
---|
| 4580 | list of field headers mapped to upper case and used to produce an array of
|
---|
| 4581 | hashes. This is a variation of C<auto>.
|
---|
| 4582 |
|
---|
| 4583 | my $aoh = csv (in => $fh, headers => "uc");
|
---|
| 4584 |
|
---|
| 4585 | =item CODE
|
---|
| 4586 |
|
---|
| 4587 | If a coderef is used, the first line of the C<CSV> source will be read as
|
---|
| 4588 | the list of mangled field headers in which each field is passed as the only
|
---|
| 4589 | argument to the coderef. This list is used to produce an array of hashes.
|
---|
| 4590 |
|
---|
| 4591 | my $aoh = csv (in => $fh,
|
---|
| 4592 | headers => sub { lc ($_[0]) =~ s/kode/code/gr });
|
---|
| 4593 |
|
---|
| 4594 | this example is a variation of using C<lc> where all occurrences of C<kode>
|
---|
| 4595 | are replaced with C<code>.
|
---|
| 4596 |
|
---|
| 4597 | =item ARRAY
|
---|
| 4598 |
|
---|
| 4599 | If C<headers> is an anonymous list, the entries in the list will be used
|
---|
| 4600 | as field names. The first line is considered data instead of headers.
|
---|
| 4601 |
|
---|
| 4602 | my $aoh = csv (in => $fh, headers => [qw( Foo Bar )]);
|
---|
| 4603 | csv (in => $aoa, out => $fh, headers => [qw( code description price )]);
|
---|
| 4604 |
|
---|
| 4605 | =item HASH
|
---|
| 4606 |
|
---|
| 4607 | If C<headers> is a hash reference, this implies C<auto>, but header fields
|
---|
| 4608 | that exist as key in the hashref will be replaced by the value for that
|
---|
| 4609 | key. Given a CSV file like
|
---|
| 4610 |
|
---|
| 4611 | post-kode,city,name,id number,fubble
|
---|
| 4612 | 1234AA,Duckstad,Donald,13,"X313DF"
|
---|
| 4613 |
|
---|
| 4614 | using
|
---|
| 4615 |
|
---|
| 4616 | csv (headers => { "post-kode" => "pc", "id number" => "ID" }, ...
|
---|
| 4617 |
|
---|
| 4618 | will return an entry like
|
---|
| 4619 |
|
---|
| 4620 | { pc => "1234AA",
|
---|
| 4621 | city => "Duckstad",
|
---|
| 4622 | name => "Donald",
|
---|
| 4623 | ID => "13",
|
---|
| 4624 | fubble => "X313DF",
|
---|
| 4625 | }
|
---|
| 4626 |
|
---|
| 4627 | =back
|
---|
| 4628 |
|
---|
| 4629 | See also L<C<munge_column_names>|/munge_column_names> and
|
---|
| 4630 | L<C<set_column_names>|/set_column_names>.
|
---|
| 4631 |
|
---|
| 4632 | =head3 munge_column_names
|
---|
| 4633 |
|
---|
| 4634 | If C<munge_column_names> is set, the method L</header> is invoked on the
|
---|
| 4635 | opened stream with all matching arguments to detect and set the headers.
|
---|
| 4636 |
|
---|
| 4637 | C<munge_column_names> can be abbreviated to C<munge>.
|
---|
| 4638 |
|
---|
| 4639 | =head3 key
|
---|
| 4640 |
|
---|
| 4641 | If passed, will default L<C<headers>|/headers> to C<"auto"> and return a
|
---|
| 4642 | hashref instead of an array of hashes. Allowed values are simple scalars or
|
---|
| 4643 | array-references where the first element is the joiner and the rest are the
|
---|
| 4644 | fields to join to combine the key.
|
---|
| 4645 |
|
---|
| 4646 | my $ref = csv (in => "test.csv", key => "code");
|
---|
| 4647 | my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ]);
|
---|
| 4648 |
|
---|
| 4649 | with test.csv like
|
---|
| 4650 |
|
---|
| 4651 | code,product,price,color
|
---|
| 4652 | 1,pc,850,gray
|
---|
| 4653 | 2,keyboard,12,white
|
---|
| 4654 | 3,mouse,5,black
|
---|
| 4655 |
|
---|
| 4656 | the first example will return
|
---|
| 4657 |
|
---|
| 4658 | { 1 => {
|
---|
| 4659 | code => 1,
|
---|
| 4660 | color => 'gray',
|
---|
| 4661 | price => 850,
|
---|
| 4662 | product => 'pc'
|
---|
| 4663 | },
|
---|
| 4664 | 2 => {
|
---|
| 4665 | code => 2,
|
---|
| 4666 | color => 'white',
|
---|
| 4667 | price => 12,
|
---|
| 4668 | product => 'keyboard'
|
---|
| 4669 | },
|
---|
| 4670 | 3 => {
|
---|
| 4671 | code => 3,
|
---|
| 4672 | color => 'black',
|
---|
| 4673 | price => 5,
|
---|
| 4674 | product => 'mouse'
|
---|
| 4675 | }
|
---|
| 4676 | }
|
---|
| 4677 |
|
---|
| 4678 | the second example will return
|
---|
| 4679 |
|
---|
| 4680 | { "1:gray" => {
|
---|
| 4681 | code => 1,
|
---|
| 4682 | color => 'gray',
|
---|
| 4683 | price => 850,
|
---|
| 4684 | product => 'pc'
|
---|
| 4685 | },
|
---|
| 4686 | "2:white" => {
|
---|
| 4687 | code => 2,
|
---|
| 4688 | color => 'white',
|
---|
| 4689 | price => 12,
|
---|
| 4690 | product => 'keyboard'
|
---|
| 4691 | },
|
---|
| 4692 | "3:black" => {
|
---|
| 4693 | code => 3,
|
---|
| 4694 | color => 'black',
|
---|
| 4695 | price => 5,
|
---|
| 4696 | product => 'mouse'
|
---|
| 4697 | }
|
---|
| 4698 | }
|
---|
| 4699 |
|
---|
| 4700 | The C<key> attribute can be combined with L<C<headers>|/headers> for C<CSV>
|
---|
| 4701 | data that has no header line, like
|
---|
| 4702 |
|
---|
| 4703 | my $ref = csv (
|
---|
| 4704 | in => "foo.csv",
|
---|
| 4705 | headers => [qw( c_foo foo bar description stock )],
|
---|
| 4706 | key => "c_foo",
|
---|
| 4707 | );
|
---|
| 4708 |
|
---|
| 4709 | =head3 value
|
---|
| 4710 |
|
---|
| 4711 | Used to create key-value hashes.
|
---|
| 4712 |
|
---|
| 4713 | Only allowed when C<key> is valid. A C<value> can be either a single column
|
---|
| 4714 | label or an anonymous list of column labels. In the first case, the value
|
---|
| 4715 | will be a simple scalar value, in the latter case, it will be a hashref.
|
---|
| 4716 |
|
---|
| 4717 | my $ref = csv (in => "test.csv", key => "code",
|
---|
| 4718 | value => "price");
|
---|
| 4719 | my $ref = csv (in => "test.csv", key => "code",
|
---|
| 4720 | value => [ "product", "price" ]);
|
---|
| 4721 | my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ],
|
---|
| 4722 | value => "price");
|
---|
| 4723 | my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ],
|
---|
| 4724 | value => [ "product", "price" ]);
|
---|
| 4725 |
|
---|
| 4726 | with test.csv like
|
---|
| 4727 |
|
---|
| 4728 | code,product,price,color
|
---|
| 4729 | 1,pc,850,gray
|
---|
| 4730 | 2,keyboard,12,white
|
---|
| 4731 | 3,mouse,5,black
|
---|
| 4732 |
|
---|
| 4733 | the first example will return
|
---|
| 4734 |
|
---|
| 4735 | { 1 => 850,
|
---|
| 4736 | 2 => 12,
|
---|
| 4737 | 3 => 5,
|
---|
| 4738 | }
|
---|
| 4739 |
|
---|
| 4740 | the second example will return
|
---|
| 4741 |
|
---|
| 4742 | { 1 => {
|
---|
| 4743 | price => 850,
|
---|
| 4744 | product => 'pc'
|
---|
| 4745 | },
|
---|
| 4746 | 2 => {
|
---|
| 4747 | price => 12,
|
---|
| 4748 | product => 'keyboard'
|
---|
| 4749 | },
|
---|
| 4750 | 3 => {
|
---|
| 4751 | price => 5,
|
---|
| 4752 | product => 'mouse'
|
---|
| 4753 | }
|
---|
| 4754 | }
|
---|
| 4755 |
|
---|
| 4756 | the third example will return
|
---|
| 4757 |
|
---|
| 4758 | { "1:gray" => 850,
|
---|
| 4759 | "2:white" => 12,
|
---|
| 4760 | "3:black" => 5,
|
---|
| 4761 | }
|
---|
| 4762 |
|
---|
| 4763 | the fourth example will return
|
---|
| 4764 |
|
---|
| 4765 | { "1:gray" => {
|
---|
| 4766 | price => 850,
|
---|
| 4767 | product => 'pc'
|
---|
| 4768 | },
|
---|
| 4769 | "2:white" => {
|
---|
| 4770 | price => 12,
|
---|
| 4771 | product => 'keyboard'
|
---|
| 4772 | },
|
---|
| 4773 | "3:black" => {
|
---|
| 4774 | price => 5,
|
---|
| 4775 | product => 'mouse'
|
---|
| 4776 | }
|
---|
| 4777 | }
|
---|
| 4778 |
|
---|
| 4779 | =head3 keep_headers
|
---|
| 4780 |
|
---|
| 4781 | When using hashes, keep the column names in the arrayref passed, so all
|
---|
| 4782 | headers are available after the call in the original order.
|
---|
| 4783 |
|
---|
| 4784 | my $aoh = csv (in => "file.csv", keep_headers => \my @hdr);
|
---|
| 4785 |
|
---|
| 4786 | This attribute can be abbreviated to C<kh> or passed as C<keep_column_names>.
|
---|
| 4787 |
|
---|
| 4788 | This attribute implies a default of C<auto> for the C<headers> attribute.
|
---|
| 4789 |
|
---|
| 4790 | =head3 fragment
|
---|
| 4791 |
|
---|
| 4792 | Only output the fragment as defined in the L</fragment> method. This option
|
---|
| 4793 | is ignored when I<generating> C<CSV>. See L</out>.
|
---|
| 4794 |
|
---|
| 4795 | Combining all of them could give something like
|
---|
| 4796 |
|
---|
| 4797 | use Text::CSV_PP qw( csv );
|
---|
| 4798 | my $aoh = csv (
|
---|
| 4799 | in => "test.txt",
|
---|
| 4800 | encoding => "utf-8",
|
---|
| 4801 | headers => "auto",
|
---|
| 4802 | sep_char => "|",
|
---|
| 4803 | fragment => "row=3;6-9;15-*",
|
---|
| 4804 | );
|
---|
| 4805 | say $aoh->[15]{Foo};
|
---|
| 4806 |
|
---|
| 4807 | =head3 sep_set
|
---|
| 4808 |
|
---|
| 4809 | If C<sep_set> is set, the method L</header> is invoked on the opened stream
|
---|
| 4810 | to detect and set L<C<sep_char>|/sep_char> with the given set.
|
---|
| 4811 |
|
---|
| 4812 | C<sep_set> can be abbreviated to C<seps>.
|
---|
| 4813 |
|
---|
| 4814 | Note that as the L</header> method is invoked, its default is to also set
|
---|
| 4815 | the headers.
|
---|
| 4816 |
|
---|
| 4817 | =head3 set_column_names
|
---|
| 4818 |
|
---|
| 4819 | If C<set_column_names> is passed, the method L</header> is invoked on the
|
---|
| 4820 | opened stream with all arguments meant for L</header>.
|
---|
| 4821 |
|
---|
| 4822 | If C<set_column_names> is passed as a false value, the content of the first
|
---|
| 4823 | row is only preserved if the output is AoA:
|
---|
| 4824 |
|
---|
| 4825 | With an input-file like
|
---|
| 4826 |
|
---|
| 4827 | bAr,foo
|
---|
| 4828 | 1,2
|
---|
| 4829 | 3,4,5
|
---|
| 4830 |
|
---|
| 4831 | This call
|
---|
| 4832 |
|
---|
| 4833 | my $aoa = csv (in => $file, set_column_names => 0);
|
---|
| 4834 |
|
---|
| 4835 | will result in
|
---|
| 4836 |
|
---|
| 4837 | [[ "bar", "foo" ],
|
---|
| 4838 | [ "1", "2" ],
|
---|
| 4839 | [ "3", "4", "5" ]]
|
---|
| 4840 |
|
---|
| 4841 | and
|
---|
| 4842 |
|
---|
| 4843 | my $aoa = csv (in => $file, set_column_names => 0, munge => "none");
|
---|
| 4844 |
|
---|
| 4845 | will result in
|
---|
| 4846 |
|
---|
| 4847 | [[ "bAr", "foo" ],
|
---|
| 4848 | [ "1", "2" ],
|
---|
| 4849 | [ "3", "4", "5" ]]
|
---|
| 4850 |
|
---|
| 4851 | =head2 Callbacks
|
---|
| 4852 |
|
---|
| 4853 | Callbacks enable actions triggered from the I<inside> of Text::CSV_PP.
|
---|
| 4854 |
|
---|
| 4855 | While most of what this enables can easily be done in an unrolled loop as
|
---|
| 4856 | described in the L</SYNOPSIS> callbacks can be used to meet special demands
|
---|
| 4857 | or enhance the L</csv> function.
|
---|
| 4858 |
|
---|
| 4859 | =over 2
|
---|
| 4860 |
|
---|
| 4861 | =item error
|
---|
| 4862 |
|
---|
| 4863 | $csv->callbacks (error => sub { $csv->SetDiag (0) });
|
---|
| 4864 |
|
---|
| 4865 | the C<error> callback is invoked when an error occurs, but I<only> when
|
---|
| 4866 | L</auto_diag> is set to a true value. A callback is invoked with the values
|
---|
| 4867 | returned by L</error_diag>:
|
---|
| 4868 |
|
---|
| 4869 | my ($c, $s);
|
---|
| 4870 |
|
---|
| 4871 | sub ignore3006
|
---|
| 4872 | {
|
---|
| 4873 | my ($err, $msg, $pos, $recno, $fldno) = @_;
|
---|
| 4874 | if ($err == 3006) {
|
---|
| 4875 | # ignore this error
|
---|
| 4876 | ($c, $s) = (undef, undef);
|
---|
| 4877 | Text::CSV_PP->SetDiag (0);
|
---|
| 4878 | }
|
---|
| 4879 | # Any other error
|
---|
| 4880 | return;
|
---|
| 4881 | } # ignore3006
|
---|
| 4882 |
|
---|
| 4883 | $csv->callbacks (error => \&ignore3006);
|
---|
| 4884 | $csv->bind_columns (\$c, \$s);
|
---|
| 4885 | while ($csv->getline ($fh)) {
|
---|
| 4886 | # Error 3006 will not stop the loop
|
---|
| 4887 | }
|
---|
| 4888 |
|
---|
| 4889 | =item after_parse
|
---|
| 4890 |
|
---|
| 4891 | $csv->callbacks (after_parse => sub { push @{$_[1]}, "NEW" });
|
---|
| 4892 | while (my $row = $csv->getline ($fh)) {
|
---|
| 4893 | $row->[-1] eq "NEW";
|
---|
| 4894 | }
|
---|
| 4895 |
|
---|
| 4896 | This callback is invoked after parsing with L</getline> only if no error
|
---|
| 4897 | occurred. The callback is invoked with two arguments: the current C<CSV>
|
---|
| 4898 | parser object and an array reference to the fields parsed.
|
---|
| 4899 |
|
---|
| 4900 | The return code of the callback is ignored unless it is a reference to the
|
---|
| 4901 | string "skip", in which case the record will be skipped in L</getline_all>.
|
---|
| 4902 |
|
---|
| 4903 | sub add_from_db
|
---|
| 4904 | {
|
---|
| 4905 | my ($csv, $row) = @_;
|
---|
| 4906 | $sth->execute ($row->[4]);
|
---|
| 4907 | push @$row, $sth->fetchrow_array;
|
---|
| 4908 | } # add_from_db
|
---|
| 4909 |
|
---|
| 4910 | my $aoa = csv (in => "file.csv", callbacks => {
|
---|
| 4911 | after_parse => \&add_from_db });
|
---|
| 4912 |
|
---|
| 4913 | This hook can be used for validation:
|
---|
| 4914 |
|
---|
| 4915 | =over 2
|
---|
| 4916 |
|
---|
| 4917 | =item FAIL
|
---|
| 4918 |
|
---|
| 4919 | Die if any of the records does not validate a rule:
|
---|
| 4920 |
|
---|
| 4921 | after_parse => sub {
|
---|
| 4922 | $_[1][4] =~ m/^[0-9]{4}\s?[A-Z]{2}$/ or
|
---|
| 4923 | die "5th field does not have a valid Dutch zipcode";
|
---|
| 4924 | }
|
---|
| 4925 |
|
---|
| 4926 | =item DEFAULT
|
---|
| 4927 |
|
---|
| 4928 | Replace invalid fields with a default value:
|
---|
| 4929 |
|
---|
| 4930 | after_parse => sub { $_[1][2] =~ m/^\d+$/ or $_[1][2] = 0 }
|
---|
| 4931 |
|
---|
| 4932 | =item SKIP
|
---|
| 4933 |
|
---|
| 4934 | Skip records that have invalid fields (only applies to L</getline_all>):
|
---|
| 4935 |
|
---|
| 4936 | after_parse => sub { $_[1][0] =~ m/^\d+$/ or return \"skip"; }
|
---|
| 4937 |
|
---|
| 4938 | =back
|
---|
| 4939 |
|
---|
| 4940 | =item before_print
|
---|
| 4941 |
|
---|
| 4942 | my $idx = 1;
|
---|
| 4943 | $csv->callbacks (before_print => sub { $_[1][0] = $idx++ });
|
---|
| 4944 | $csv->print (*STDOUT, [ 0, $_ ]) for @members;
|
---|
| 4945 |
|
---|
| 4946 | This callback is invoked before printing with L</print> only if no error
|
---|
| 4947 | occurred. The callback is invoked with two arguments: the current C<CSV>
|
---|
| 4948 | parser object and an array reference to the fields passed.
|
---|
| 4949 |
|
---|
| 4950 | The return code of the callback is ignored.
|
---|
| 4951 |
|
---|
| 4952 | sub max_4_fields
|
---|
| 4953 | {
|
---|
| 4954 | my ($csv, $row) = @_;
|
---|
| 4955 | @$row > 4 and splice @$row, 4;
|
---|
| 4956 | } # max_4_fields
|
---|
| 4957 |
|
---|
| 4958 | csv (in => csv (in => "file.csv"), out => *STDOUT,
|
---|
| 4959 | callbacks => { before_print => \&max_4_fields });
|
---|
| 4960 |
|
---|
| 4961 | This callback is not active for L</combine>.
|
---|
| 4962 |
|
---|
| 4963 | =back
|
---|
| 4964 |
|
---|
| 4965 | =head3 Callbacks for csv ()
|
---|
| 4966 |
|
---|
| 4967 | The L</csv> function allows for some callbacks that do not integrate in XS internals
|
---|
| 4968 | but only feature the L</csv> function.
|
---|
| 4969 |
|
---|
| 4970 | csv (in => "file.csv",
|
---|
| 4971 | callbacks => {
|
---|
| 4972 | filter => { 6 => sub { $_ > 15 } }, # first
|
---|
| 4973 | after_parse => sub { say "AFTER PARSE"; }, # first
|
---|
| 4974 | after_in => sub { say "AFTER IN"; }, # second
|
---|
| 4975 | on_in => sub { say "ON IN"; }, # third
|
---|
| 4976 | },
|
---|
| 4977 | );
|
---|
| 4978 |
|
---|
| 4979 | csv (in => $aoh,
|
---|
| 4980 | out => "file.csv",
|
---|
| 4981 | callbacks => {
|
---|
| 4982 | on_in => sub { say "ON IN"; }, # first
|
---|
| 4983 | before_out => sub { say "BEFORE OUT"; }, # second
|
---|
| 4984 | before_print => sub { say "BEFORE PRINT"; }, # third
|
---|
| 4985 | },
|
---|
| 4986 | );
|
---|
| 4987 |
|
---|
| 4988 | =over 2
|
---|
| 4989 |
|
---|
| 4990 | =item filter
|
---|
| 4991 |
|
---|
| 4992 | This callback can be used to filter records. It is called just after a new
|
---|
| 4993 | record has been scanned. The callback accepts a:
|
---|
| 4994 |
|
---|
| 4995 | =over 2
|
---|
| 4996 |
|
---|
| 4997 | =item hashref
|
---|
| 4998 |
|
---|
| 4999 | The keys are the index to the row (the field name or field number, 1-based)
|
---|
| 5000 | and the values are subs to return a true or false value.
|
---|
| 5001 |
|
---|
| 5002 | csv (in => "file.csv", filter => {
|
---|
| 5003 | 3 => sub { m/a/ }, # third field should contain an "a"
|
---|
| 5004 | 5 => sub { length > 4 }, # length of the 5th field minimal 5
|
---|
| 5005 | });
|
---|
| 5006 |
|
---|
| 5007 | csv (in => "file.csv", filter => { foo => sub { $_ > 4 }});
|
---|
| 5008 |
|
---|
| 5009 | If the keys to the filter hash contain any character that is not a digit it
|
---|
| 5010 | will also implicitly set L</headers> to C<"auto"> unless L</headers> was
|
---|
| 5011 | already passed as argument. When headers are active, returning an array of
|
---|
| 5012 | hashes, the filter is not applicable to the header itself.
|
---|
| 5013 |
|
---|
| 5014 | All sub results should match, as in AND.
|
---|
| 5015 |
|
---|
| 5016 | The context of the callback sets C<$_> localized to the field indicated by
|
---|
| 5017 | the filter. The two arguments are as with all other callbacks, so the other
|
---|
| 5018 | fields in the current row can be seen:
|
---|
| 5019 |
|
---|
| 5020 | filter => { 3 => sub { $_ > 100 ? $_[1][1] =~ m/A/ : $_[1][6] =~ m/B/ }}
|
---|
| 5021 |
|
---|
| 5022 | If the context is set to return a list of hashes (L</headers> is defined),
|
---|
| 5023 | the current record will also be available in the localized C<%_>:
|
---|
| 5024 |
|
---|
| 5025 | filter => { 3 => sub { $_ > 100 && $_{foo} =~ m/A/ && $_{bar} < 1000 }}
|
---|
| 5026 |
|
---|
| 5027 | If the filter is used to I<alter> the content by changing C<$_>, make sure
|
---|
| 5028 | that the sub returns true in order not to have that record skipped:
|
---|
| 5029 |
|
---|
| 5030 | filter => { 2 => sub { $_ = uc }}
|
---|
| 5031 |
|
---|
| 5032 | will upper-case the second field, and then skip it if the resulting content
|
---|
| 5033 | evaluates to false. To always accept, end with truth:
|
---|
| 5034 |
|
---|
| 5035 | filter => { 2 => sub { $_ = uc; 1 }}
|
---|
| 5036 |
|
---|
| 5037 | =item coderef
|
---|
| 5038 |
|
---|
| 5039 | csv (in => "file.csv", filter => sub { $n++; 0; });
|
---|
| 5040 |
|
---|
| 5041 | If the argument to C<filter> is a coderef, it is an alias or shortcut to a
|
---|
| 5042 | filter on column 0:
|
---|
| 5043 |
|
---|
| 5044 | csv (filter => sub { $n++; 0 });
|
---|
| 5045 |
|
---|
| 5046 | is equal to
|
---|
| 5047 |
|
---|
| 5048 | csv (filter => { 0 => sub { $n++; 0 } });
|
---|
| 5049 |
|
---|
| 5050 | =item filter-name
|
---|
| 5051 |
|
---|
| 5052 | csv (in => "file.csv", filter => "not_blank");
|
---|
| 5053 | csv (in => "file.csv", filter => "not_empty");
|
---|
| 5054 | csv (in => "file.csv", filter => "filled");
|
---|
| 5055 |
|
---|
| 5056 | These are predefined filters
|
---|
| 5057 |
|
---|
| 5058 | Given a file like (line numbers prefixed for doc purpose only):
|
---|
| 5059 |
|
---|
| 5060 | 1:1,2,3
|
---|
| 5061 | 2:
|
---|
| 5062 | 3:,
|
---|
| 5063 | 4:""
|
---|
| 5064 | 5:,,
|
---|
| 5065 | 6:, ,
|
---|
| 5066 | 7:"",
|
---|
| 5067 | 8:" "
|
---|
| 5068 | 9:4,5,6
|
---|
| 5069 |
|
---|
| 5070 | =over 2
|
---|
| 5071 |
|
---|
| 5072 | =item not_blank
|
---|
| 5073 |
|
---|
| 5074 | Filter out the blank lines
|
---|
| 5075 |
|
---|
| 5076 | This filter is a shortcut for
|
---|
| 5077 |
|
---|
| 5078 | filter => { 0 => sub { @{$_[1]} > 1 or
|
---|
| 5079 | defined $_[1][0] && $_[1][0] ne "" } }
|
---|
| 5080 |
|
---|
| 5081 | Due to the implementation, it is currently impossible to also filter lines
|
---|
| 5082 | that consist only of a quoted empty field. These lines are also considered
|
---|
| 5083 | blank lines.
|
---|
| 5084 |
|
---|
| 5085 | With the given example, lines 2 and 4 will be skipped.
|
---|
| 5086 |
|
---|
| 5087 | =item not_empty
|
---|
| 5088 |
|
---|
| 5089 | Filter out lines where all the fields are empty.
|
---|
| 5090 |
|
---|
| 5091 | This filter is a shortcut for
|
---|
| 5092 |
|
---|
| 5093 | filter => { 0 => sub { grep { defined && $_ ne "" } @{$_[1]} } }
|
---|
| 5094 |
|
---|
| 5095 | A space is not regarded as being empty, so given the example data, lines 2, 3,
|
---|
| 5096 | 4, 5, and 7 are skipped.
|
---|
| 5097 |
|
---|
| 5098 | =item filled
|
---|
| 5099 |
|
---|
| 5100 | Filter out lines that have no visible data
|
---|
| 5101 |
|
---|
| 5102 | This filter is a shortcut for
|
---|
| 5103 |
|
---|
| 5104 | filter => { 0 => sub { grep { defined && m/\S/ } @{$_[1]} } }
|
---|
| 5105 |
|
---|
| 5106 | This filter rejects all lines that do I<not> have at least one field that does
|
---|
| 5107 | not evaluate to the empty string.
|
---|
| 5108 |
|
---|
| 5109 | With the given example data, this filter would skip lines 2 through 8.
|
---|
| 5110 |
|
---|
| 5111 | =back
|
---|
| 5112 |
|
---|
| 5113 | =back
|
---|
| 5114 |
|
---|
| 5115 | =item after_in
|
---|
| 5116 |
|
---|
| 5117 | This callback is invoked for each record after all records have been parsed
|
---|
| 5118 | but before returning the reference to the caller. The hook is invoked with
|
---|
| 5119 | two arguments: the current C<CSV> parser object and a reference to the
|
---|
| 5120 | record. The reference can be a reference to a HASH or a reference to an
|
---|
| 5121 | ARRAY as determined by the arguments.
|
---|
| 5122 |
|
---|
| 5123 | This callback can also be passed as an attribute without the C<callbacks>
|
---|
| 5124 | wrapper.
|
---|
| 5125 |
|
---|
| 5126 | =item before_out
|
---|
| 5127 |
|
---|
| 5128 | This callback is invoked for each record before the record is printed. The
|
---|
| 5129 | hook is invoked with two arguments: the current C<CSV> parser object and a
|
---|
| 5130 | reference to the record. The reference can be a reference to a HASH or a
|
---|
| 5131 | reference to an ARRAY as determined by the arguments.
|
---|
| 5132 |
|
---|
| 5133 | This callback can also be passed as an attribute without the C<callbacks>
|
---|
| 5134 | wrapper.
|
---|
| 5135 |
|
---|
| 5136 | This callback makes the row available in C<%_> if the row is a hashref. In
|
---|
| 5137 | this case C<%_> is writable and will change the original row.
|
---|
| 5138 |
|
---|
| 5139 | =item on_in
|
---|
| 5140 |
|
---|
| 5141 | This callback acts exactly as the L</after_in> or the L</before_out> hooks.
|
---|
| 5142 |
|
---|
| 5143 | This callback can also be passed as an attribute without the C<callbacks>
|
---|
| 5144 | wrapper.
|
---|
| 5145 |
|
---|
| 5146 | This callback makes the row available in C<%_> if the row is a hashref. In
|
---|
| 5147 | this case C<%_> is writable and will change the original row. So e.g. with
|
---|
| 5148 |
|
---|
| 5149 | my $aoh = csv (
|
---|
| 5150 | in => \"foo\n1\n2\n",
|
---|
| 5151 | headers => "auto",
|
---|
| 5152 | on_in => sub { $_{bar} = 2; },
|
---|
| 5153 | );
|
---|
| 5154 |
|
---|
| 5155 | C<$aoh> will be:
|
---|
| 5156 |
|
---|
| 5157 | [ { foo => 1,
|
---|
| 5158 | bar => 2,
|
---|
| 5159 | },
|
---|
| 5160 | { foo => 2,
|
---|
| 5161 | bar => 2,
|
---|
| 5162 | }
|
---|
| 5163 | ]
|
---|
| 5164 |
|
---|
| 5165 | =item csv
|
---|
| 5166 |
|
---|
| 5167 | The I<function> L</csv> can also be called as a method or with an existing
|
---|
| 5168 | Text::CSV_PP object. This could help if the function is to be invoked a lot
|
---|
| 5169 | of times and the overhead of creating the object internally over and over
|
---|
| 5170 | again would be prevented by passing an existing instance.
|
---|
| 5171 |
|
---|
| 5172 | my $csv = Text::CSV_PP->new ({ binary => 1, auto_diag => 1 });
|
---|
| 5173 |
|
---|
| 5174 | my $aoa = $csv->csv (in => $fh);
|
---|
| 5175 | my $aoa = csv (in => $fh, csv => $csv);
|
---|
| 5176 |
|
---|
| 5177 | both act the same. Running this 20000 times on a 20-line CSV file showed
|
---|
| 5178 | a 53% speedup.
|
---|
| 5179 |
|
---|
| 5180 | =back
|
---|
| 5181 |
|
---|
| 5182 | =head1 DIAGNOSTICS
|
---|
| 5183 |
|
---|
| 5184 | This section is also taken from Text::CSV_XS.
|
---|
| 5185 |
|
---|
| 5186 | Still under construction ...
|
---|
| 5187 |
|
---|
| 5188 | If an error occurs, C<< $csv->error_diag >> can be used to get information
|
---|
| 5189 | on the cause of the failure. Note that for speed reasons the internal value
|
---|
| 5190 | is never cleared on success, so using the value returned by L</error_diag>
|
---|
| 5191 | in normal cases - when no error occurred - may cause unexpected results.
|
---|
| 5192 |
|
---|
| 5193 | If the constructor failed, the cause can be found using L</error_diag> as a
|
---|
| 5194 | class method, like C<< Text::CSV_PP->error_diag >>.
|
---|
| 5195 |
|
---|
| 5196 | The C<< $csv->error_diag >> method is automatically invoked upon error when
|
---|
| 5197 | the constructor was called with L<C<auto_diag>|/auto_diag> set to C<1> or
|
---|
| 5198 | C<2>, or when L<autodie> is in effect. When set to C<1>, this will cause a
|
---|
| 5199 | C<warn> with the error message, when set to C<2>, it will C<die>. C<2012 -
|
---|
| 5200 | EOF> is excluded from L<C<auto_diag>|/auto_diag> reports.
|
---|
| 5201 |
|
---|
| 5202 | Errors can be (individually) caught using the L</error> callback.
|
---|
| 5203 |
|
---|
| 5204 | The errors as described below are available. I have tried to make the error
|
---|
| 5205 | itself explanatory enough, but more descriptions will be added. For most of
|
---|
| 5206 | these errors, the first three capitals describe the error category:
|
---|
| 5207 |
|
---|
| 5208 | =over 2
|
---|
| 5209 |
|
---|
| 5210 | =item *
|
---|
| 5211 | INI
|
---|
| 5212 |
|
---|
| 5213 | Initialization error or option conflict.
|
---|
| 5214 |
|
---|
| 5215 | =item *
|
---|
| 5216 | ECR
|
---|
| 5217 |
|
---|
| 5218 | Carriage-Return related parse error.
|
---|
| 5219 |
|
---|
| 5220 | =item *
|
---|
| 5221 | EOF
|
---|
| 5222 |
|
---|
| 5223 | End-Of-File related parse error.
|
---|
| 5224 |
|
---|
| 5225 | =item *
|
---|
| 5226 | EIQ
|
---|
| 5227 |
|
---|
| 5228 | Parse error inside quotation.
|
---|
| 5229 |
|
---|
| 5230 | =item *
|
---|
| 5231 | EIF
|
---|
| 5232 |
|
---|
| 5233 | Parse error inside field.
|
---|
| 5234 |
|
---|
| 5235 | =item *
|
---|
| 5236 | ECB
|
---|
| 5237 |
|
---|
| 5238 | Combine error.
|
---|
| 5239 |
|
---|
| 5240 | =item *
|
---|
| 5241 | EHR
|
---|
| 5242 |
|
---|
| 5243 | HashRef parse related error.
|
---|
| 5244 |
|
---|
| 5245 | =back
|
---|
| 5246 |
|
---|
| 5247 | And below should be the complete list of error codes that can be returned:
|
---|
| 5248 |
|
---|
| 5249 | =over 2
|
---|
| 5250 |
|
---|
| 5251 | =item *
|
---|
| 5252 | 1001 "INI - sep_char is equal to quote_char or escape_char"
|
---|
| 5253 |
|
---|
| 5254 | The L<separation character|/sep_char> cannot be equal to L<the quotation
|
---|
| 5255 | character|/quote_char> or to L<the escape character|/escape_char>, as this
|
---|
| 5256 | would invalidate all parsing rules.
|
---|
| 5257 |
|
---|
| 5258 | =item *
|
---|
| 5259 | 1002 "INI - allow_whitespace with escape_char or quote_char SP or TAB"
|
---|
| 5260 |
|
---|
| 5261 | Using the L<C<allow_whitespace>|/allow_whitespace> attribute when either
|
---|
| 5262 | L<C<quote_char>|/quote_char> or L<C<escape_char>|/escape_char> is equal to
|
---|
| 5263 | C<SPACE> or C<TAB> is too ambiguous to allow.
|
---|
| 5264 |
|
---|
| 5265 | =item *
|
---|
| 5266 | 1003 "INI - \r or \n in main attr not allowed"
|
---|
| 5267 |
|
---|
| 5268 | Using default L<C<eol>|/eol> characters in either L<C<sep_char>|/sep_char>,
|
---|
| 5269 | L<C<quote_char>|/quote_char>, or L<C<escape_char>|/escape_char> is not
|
---|
| 5270 | allowed.
|
---|
| 5271 |
|
---|
| 5272 | =item *
|
---|
| 5273 | 1004 "INI - callbacks should be undef or a hashref"
|
---|
| 5274 |
|
---|
| 5275 | The L<C<callbacks>|/Callbacks> attribute is only allowed to be C<undef> or
|
---|
| 5276 | a hash reference.
|
---|
| 5277 |
|
---|
| 5278 | =item *
|
---|
| 5279 | 1005 "INI - EOL too long"
|
---|
| 5280 |
|
---|
| 5281 | The value passed for EOL is exceeding its maximum length (16).
|
---|
| 5282 |
|
---|
| 5283 | =item *
|
---|
| 5284 | 1006 "INI - SEP too long"
|
---|
| 5285 |
|
---|
| 5286 | The value passed for SEP is exceeding its maximum length (16).
|
---|
| 5287 |
|
---|
| 5288 | =item *
|
---|
| 5289 | 1007 "INI - QUOTE too long"
|
---|
| 5290 |
|
---|
| 5291 | The value passed for QUOTE is exceeding its maximum length (16).
|
---|
| 5292 |
|
---|
| 5293 | =item *
|
---|
| 5294 | 1008 "INI - SEP undefined"
|
---|
| 5295 |
|
---|
| 5296 | The value passed for SEP should be defined and not empty.
|
---|
| 5297 |
|
---|
| 5298 | =item *
|
---|
| 5299 | 1010 "INI - the header is empty"
|
---|
| 5300 |
|
---|
| 5301 | The header line parsed in the L</header> is empty.
|
---|
| 5302 |
|
---|
| 5303 | =item *
|
---|
| 5304 | 1011 "INI - the header contains more than one valid separator"
|
---|
| 5305 |
|
---|
| 5306 | The header line parsed in the L</header> contains more than one (unique)
|
---|
| 5307 | separator character out of the allowed set of separators.
|
---|
| 5308 |
|
---|
| 5309 | =item *
|
---|
| 5310 | 1012 "INI - the header contains an empty field"
|
---|
| 5311 |
|
---|
| 5312 | The header line parsed in the L</header> contains an empty field.
|
---|
| 5313 |
|
---|
| 5314 | =item *
|
---|
| 5315 | 1013 "INI - the header contains nun-unique fields"
|
---|
| 5316 |
|
---|
| 5317 | The header line parsed in the L</header> contains at least two identical
|
---|
| 5318 | fields.
|
---|
| 5319 |
|
---|
| 5320 | =item *
|
---|
| 5321 | 1014 "INI - header called on undefined stream"
|
---|
| 5322 |
|
---|
| 5323 | The header line cannot be parsed from an undefined source.
|
---|
| 5324 |
|
---|
| 5325 | =item *
|
---|
| 5326 | 1500 "PRM - Invalid/unsupported argument(s)"
|
---|
| 5327 |
|
---|
| 5328 | Function or method called with invalid argument(s) or parameter(s).
|
---|
| 5329 |
|
---|
| 5330 | =item *
|
---|
| 5331 | 1501 "PRM - The key attribute is passed as an unsupported type"
|
---|
| 5332 |
|
---|
| 5333 | The C<key> attribute is of an unsupported type.
|
---|
| 5334 |
|
---|
| 5335 | =item *
|
---|
| 5336 | 1502 "PRM - The value attribute is passed without the key attribute"
|
---|
| 5337 |
|
---|
| 5338 | The C<value> attribute is only allowed when a valid key is given.
|
---|
| 5339 |
|
---|
| 5340 | =item *
|
---|
| 5341 | 1503 "PRM - The value attribute is passed as an unsupported type"
|
---|
| 5342 |
|
---|
| 5343 | The C<value> attribute is of an unsupported type.
|
---|
| 5344 |
|
---|
| 5345 | =item *
|
---|
| 5346 | 2010 "ECR - QUO char inside quotes followed by CR not part of EOL"
|
---|
| 5347 |
|
---|
| 5348 | When L<C<eol>|/eol> has been set to anything but the default, like
|
---|
| 5349 | C<"\r\t\n">, and the C<"\r"> is following the B<second> (closing)
|
---|
| 5350 | L<C<quote_char>|/quote_char>, where the characters following the C<"\r"> do
|
---|
| 5351 | not make up the L<C<eol>|/eol> sequence, this is an error.
|
---|
| 5352 |
|
---|
| 5353 | =item *
|
---|
| 5354 | 2011 "ECR - Characters after end of quoted field"
|
---|
| 5355 |
|
---|
| 5356 | Sequences like C<1,foo,"bar"baz,22,1> are not allowed. C<"bar"> is a quoted
|
---|
| 5357 | field and after the closing double-quote, there should be either a new-line
|
---|
| 5358 | sequence or a separation character.
|
---|
| 5359 |
|
---|
| 5360 | =item *
|
---|
| 5361 | 2012 "EOF - End of data in parsing input stream"
|
---|
| 5362 |
|
---|
| 5363 | Self-explaining. End-of-file while inside parsing a stream. Can happen only
|
---|
| 5364 | when reading from streams with L</getline>, as using L</parse> is done on
|
---|
| 5365 | strings that are not required to have a trailing L<C<eol>|/eol>.
|
---|
| 5366 |
|
---|
| 5367 | =item *
|
---|
| 5368 | 2013 "INI - Specification error for fragments RFC7111"
|
---|
| 5369 |
|
---|
| 5370 | Invalid specification for URI L</fragment> specification.
|
---|
| 5371 |
|
---|
| 5372 | =item *
|
---|
| 5373 | 2014 "ENF - Inconsistent number of fields"
|
---|
| 5374 |
|
---|
| 5375 | Inconsistent number of fields under strict parsing.
|
---|
| 5376 |
|
---|
| 5377 | =item *
|
---|
| 5378 | 2021 "EIQ - NL char inside quotes, binary off"
|
---|
| 5379 |
|
---|
| 5380 | Sequences like C<1,"foo\nbar",22,1> are allowed only when the binary option
|
---|
| 5381 | has been selected with the constructor.
|
---|
| 5382 |
|
---|
| 5383 | =item *
|
---|
| 5384 | 2022 "EIQ - CR char inside quotes, binary off"
|
---|
| 5385 |
|
---|
| 5386 | Sequences like C<1,"foo\rbar",22,1> are allowed only when the binary option
|
---|
| 5387 | has been selected with the constructor.
|
---|
| 5388 |
|
---|
| 5389 | =item *
|
---|
| 5390 | 2023 "EIQ - QUO character not allowed"
|
---|
| 5391 |
|
---|
| 5392 | Sequences like C<"foo "bar" baz",qu> and C<2023,",2008-04-05,"Foo, Bar",\n>
|
---|
| 5393 | will cause this error.
|
---|
| 5394 |
|
---|
| 5395 | =item *
|
---|
| 5396 | 2024 "EIQ - EOF cannot be escaped, not even inside quotes"
|
---|
| 5397 |
|
---|
| 5398 | The escape character is not allowed as the last character in an input stream.
|
---|
| 5399 |
|
---|
| 5400 | =item *
|
---|
| 5401 | 2025 "EIQ - Loose unescaped escape"
|
---|
| 5402 |
|
---|
| 5403 | An escape character should escape only characters that need escaping.
|
---|
| 5404 |
|
---|
| 5405 | Allowing the escape for other characters is possible with the attribute
|
---|
| 5406 | L</allow_loose_escape>.
|
---|
| 5407 |
|
---|
| 5408 | =item *
|
---|
| 5409 | 2026 "EIQ - Binary character inside quoted field, binary off"
|
---|
| 5410 |
|
---|
| 5411 | Binary characters are not allowed by default. Exceptions are fields that
|
---|
| 5412 | contain valid UTF-8; these will automatically be upgraded.
|
---|
| 5413 | Set L<C<binary>|/binary> to C<1> to accept binary data.
|
---|
| 5414 |
|
---|
| 5415 | =item *
|
---|
| 5416 | 2027 "EIQ - Quoted field not terminated"
|
---|
| 5417 |
|
---|
| 5418 | When parsing a field that started with a quotation character, the field is
|
---|
| 5419 | expected to be closed with a quotation character. When the parsed line is
|
---|
| 5420 | exhausted before the quote is found, that field is not terminated.
|
---|
| 5421 |
|
---|
| 5422 | =item *
|
---|
| 5423 | 2030 "EIF - NL char inside unquoted verbatim, binary off"
|
---|
| 5424 |
|
---|
| 5425 | =item *
|
---|
| 5426 | 2031 "EIF - CR char is first char of field, not part of EOL"
|
---|
| 5427 |
|
---|
| 5428 | =item *
|
---|
| 5429 | 2032 "EIF - CR char inside unquoted, not part of EOL"
|
---|
| 5430 |
|
---|
| 5431 | =item *
|
---|
| 5432 | 2034 "EIF - Loose unescaped quote"
|
---|
| 5433 |
|
---|
| 5434 | =item *
|
---|
| 5435 | 2035 "EIF - Escaped EOF in unquoted field"
|
---|
| 5436 |
|
---|
| 5437 | =item *
|
---|
| 5438 | 2036 "EIF - ESC error"
|
---|
| 5439 |
|
---|
| 5440 | =item *
|
---|
| 5441 | 2037 "EIF - Binary character in unquoted field, binary off"
|
---|
| 5442 |
|
---|
| 5443 | =item *
|
---|
| 5444 | 2110 "ECB - Binary character in Combine, binary off"
|
---|
| 5445 |
|
---|
| 5446 | =item *
|
---|
| 5447 | 2200 "EIO - print to IO failed. See errno"
|
---|
| 5448 |
|
---|
| 5449 | =item *
|
---|
| 5450 | 3001 "EHR - Unsupported syntax for column_names ()"
|
---|
| 5451 |
|
---|
| 5452 | =item *
|
---|
| 5453 | 3002 "EHR - getline_hr () called before column_names ()"
|
---|
| 5454 |
|
---|
| 5455 | =item *
|
---|
| 5456 | 3003 "EHR - bind_columns () and column_names () fields count mismatch"
|
---|
| 5457 |
|
---|
| 5458 | =item *
|
---|
| 5459 | 3004 "EHR - bind_columns () only accepts refs to scalars"
|
---|
| 5460 |
|
---|
| 5461 | =item *
|
---|
| 5462 | 3006 "EHR - bind_columns () did not pass enough refs for parsed fields"
|
---|
| 5463 |
|
---|
| 5464 | =item *
|
---|
| 5465 | 3007 "EHR - bind_columns needs refs to writable scalars"
|
---|
| 5466 |
|
---|
| 5467 | =item *
|
---|
| 5468 | 3008 "EHR - unexpected error in bound fields"
|
---|
| 5469 |
|
---|
| 5470 | =item *
|
---|
| 5471 | 3009 "EHR - print_hr () called before column_names ()"
|
---|
| 5472 |
|
---|
| 5473 | =item *
|
---|
| 5474 | 3010 "EHR - print_hr () called with invalid arguments"
|
---|
| 5475 |
|
---|
| 5476 | =back
|
---|
| 5477 |
|
---|
| 5478 | =head1 SEE ALSO
|
---|
| 5479 |
|
---|
| 5480 | L<Text::CSV_XS>, L<Text::CSV>
|
---|
| 5481 |
|
---|
| 5482 | Older versions took many regexps from L<http://www.din.or.jp/~ohzaki/perl.htm>
|
---|
| 5483 |
|
---|
| 5484 | =head1 AUTHOR
|
---|
| 5485 |
|
---|
| 5486 | Kenichi Ishigaki, E<lt>ishigaki[at]cpan.orgE<gt>
|
---|
| 5487 | Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
---|
| 5488 |
|
---|
| 5489 | Text::CSV_XS was written by E<lt>joe[at]ispsoft.deE<gt>
|
---|
| 5490 | and maintained by E<lt>h.m.brand[at]xs4all.nlE<gt>.
|
---|
| 5491 |
|
---|
| 5492 | Text::CSV was written by E<lt>alan[at]mfgrtl.comE<gt>.
|
---|
| 5493 |
|
---|
| 5494 | =head1 COPYRIGHT AND LICENSE
|
---|
| 5495 |
|
---|
| 5496 | Copyright 2017- by Kenichi Ishigaki, E<lt>ishigaki[at]cpan.orgE<gt>
|
---|
| 5497 | Copyright 2005-2015 by Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
---|
| 5498 |
|
---|
| 5499 | Most of the code and doc is directly taken from the pure perl part of
|
---|
| 5500 | Text::CSV_XS.
|
---|
| 5501 |
|
---|
| 5502 | Copyright (C) 2007-2016 H.Merijn Brand. All rights reserved.
|
---|
| 5503 | Copyright (C) 1998-2001 Jochen Wiedmann. All rights reserved.
|
---|
| 5504 | Copyright (C) 1997 Alan Citterman. All rights reserved.
|
---|
| 5505 |
|
---|
| 5506 | This library is free software; you can redistribute it and/or modify
|
---|
| 5507 | it under the same terms as Perl itself.
|
---|
| 5508 |
|
---|
| 5509 | =cut
|
---|