check best performance after stable CSV algorithms #1

ppKrauss · 2017-03-21T13:28:41Z

Small and medium files

CREATE FUNCTION lib.get_csvfile(
  file text,
  delim_char char(1) = ',',
  quote_char char(1) = '"'
) RETURNS setof text[]  AS $f$
  import csv
  return csv.reader(
     open(file, 'rb'), 
     quotechar=quote_char, 
     delimiter=delim_char, 
     skipinitialspace=True, 
     escapechar='\\'
  )
$f$ immutable language PLpythonU;

CREATE FUNCTION lib.get_csvline(
    _line text,
    _delim_char char(1) = ',',  
    _quote_char char(1) = '"'
) RETURNS text[] AS $f$
    import csv
    return [row for row in csv.reader(
        [_line], 
        quotechar = _quote_char, 
        delimiter = _delim_char, 
        skipinitialspace = True, 
        escapechar = '\\'
    )][0]
$f$ immutable language PLpythonU;

big CSV files

For big CSV files, best is to use EXTENSION file_fdw. Complex but works fine, and need only one command after implemented.

    CREATE SCHEMA csv; -- a library, easy to drop 

    CREATE FUNCTION csv.split_csv2(
      line text,                  -- the input CSV UTF-8 line
      delim_char char(1) = ',',   -- can be also E'\t', ';', '|', etc.
      quote_char char(1) = '"'   -- field quotation
    ) RETURNS setof text[] AS $f$
      import csv
      return csv.reader(
          line.splitlines(), 
          quotechar=quote_char, 
          delimiter=delim_char, 
          skipinitialspace=True, 
          escapechar='\\'
      )
    $f$ IMMUTABLE language PLpythonU;

    CREATE FUNCTION csv.split_csv_line2(
      text, char(1) DEFAULT ',', char(1) DEFAULT '"'
    ) RETURNS text[] AS $f$
      SELECT x FROM csv.split_csv($1,$2,$3) x LIMIT 1;
    $f$ language SQL IMMUTABLE;

   -- not reliable but fast? http://stackoverflow.com/a/42926265/287948
   -- compare performance with csv.split_csv_line2()
   -- ideal a C implementation
    CREATE OR REPLACE FUNCTION csv.split_csv_line(
       line text,                 -- the input CSV string
       delim_char char(1) = ',',  -- can be also E'\t', ';', '|', etc.
       quote_char char(1) = '"',  -- field quotation
       OUT result text[]
    ) LANGUAGE plpgsql AS
    $f$DECLARE
       i integer;
       t text := '';
       c char(1);
       /*
        * 0 means unquoted
        * 1 means quoted
        * 2 means quoted, and we just read a quote
        */
       q integer := 0;
    BEGIN
       /* loop through the characters */
       FOR i IN 1..length(line) LOOP
          /* get i-th character */
          c := substring(line FROM i FOR 1);
          /* end of string is at an unquoted delimiter */
          IF c = delim_char AND q <> 1 THEN
             result := result || t;
             t := '';
             q := 0;
          ELSIF c = quote_char THEN
             CASE q
                WHEN 0 THEN
                   q := 1;
                WHEN 1 THEN
                   IF c = quote_char THEN
                      q := 2;
                   ELSE
                      t := t || c;
                   END IF;
                WHEN 2 THEN
                   q := 1;
                   t := t || quote_char;
             END CASE;
          ELSE
             IF q = 2 THEN
                q := 0;
             END IF;
             t := t || c;
          END IF;
       END LOOP;
       /* add the last string */
       result := result || t;
    END;$f$;

    CREATE EXTENSION file_fdw WITH SCHEMA csv;
    CREATE SERVER files FOREIGN DATA WRAPPER file_fdw;

    CREATE or replace FUNCTION csv.ftable_drop(tname text) RETURNS void AS $f$
      BEGIN
      EXECUTE format('DROP VIEW IF EXISTS %s', 'csv.'||$1);
      EXECUTE format('DROP FOREIGN TABLE IF EXISTS %s CASCADE', 'csv.fgn_'||$1);
      END
    $f$ language PLpgSQL;
    
    CREATE or replace FUNCTION csv.ftable_create(
         tname text,  -- table name for scan text and "csv_tname"  
         fpath text,  -- file path to the input CSV
         delim_char char(1) = ',',   -- can be also E'\t', ';', '|', etc.
         quote_char char(1) = '"',   -- field quotation
         p_check boolean default true  -- false to not check-and-drop.
    ) RETURNS text AS $f$
      DECLARE
        fgn_tname text;
        csv_tname text;
      BEGIN
      fgn_tname := 'csv.fgn_'||$1;
      csv_tname := 'csv.'||$1;
      IF p_check THEN PERFORM csv.ftable_drop($1); END IF;
      EXECUTE format(E'CREATE FOREIGN TABLE %s (line text) SERVER files OPTIONS ( filename %L, format \'text\')', fgn_tname, $2);
      EXECUTE format('CREATE VIEW %s AS SELECT csv.split_csv_line(line,%L,%L) as c FROM %s', csv_tname, delim_char,quote_char, fgn_tname);
      RETURN format(E'\ntables %I and %I created\n',fgn_tname,csv_tname);
      END
    $f$ language PLpgSQL;

    ---- TESTING ----
    SELECT csv.ftable_create('my','/tmp/TSE2/lix.csv',';');

generates schema from a CSV for a PostgreSQL Copy

The text was updated successfully, but these errors were encountered:

ppKrauss changed the title ~~check best performance after stable algorithms~~ check best performance after stable CSV algorithms Mar 21, 2017

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

check best performance after stable CSV algorithms #1

check best performance after stable CSV algorithms #1

ppKrauss commented Mar 21, 2017 •

edited

Loading

check best performance after stable CSV algorithms #1

check best performance after stable CSV algorithms #1

Comments

ppKrauss commented Mar 21, 2017 • edited Loading

Small and medium files

big CSV files

ppKrauss commented Mar 21, 2017 •

edited

Loading