Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

check best performance after stable CSV algorithms #1

Open
ppKrauss opened this issue Mar 21, 2017 · 0 comments
Open

check best performance after stable CSV algorithms #1

ppKrauss opened this issue Mar 21, 2017 · 0 comments

Comments

@ppKrauss
Copy link
Contributor

ppKrauss commented Mar 21, 2017

Small and medium files

CREATE FUNCTION lib.get_csvfile(
  file text,
  delim_char char(1) = ',',
  quote_char char(1) = '"'
) RETURNS setof text[]  AS $f$
  import csv
  return csv.reader(
     open(file, 'rb'), 
     quotechar=quote_char, 
     delimiter=delim_char, 
     skipinitialspace=True, 
     escapechar='\\'
  )
$f$ immutable language PLpythonU;

CREATE FUNCTION lib.get_csvline(
    _line text,
    _delim_char char(1) = ',',  
    _quote_char char(1) = '"'
) RETURNS text[] AS $f$
    import csv
    return [row for row in csv.reader(
        [_line], 
        quotechar = _quote_char, 
        delimiter = _delim_char, 
        skipinitialspace = True, 
        escapechar = '\\'
    )][0]
$f$ immutable language PLpythonU;

big CSV files

For big CSV files, best is to use EXTENSION file_fdw. Complex but works fine, and need only one command after implemented.

    CREATE SCHEMA csv; -- a library, easy to drop 

    CREATE FUNCTION csv.split_csv2(
      line text,                  -- the input CSV UTF-8 line
      delim_char char(1) = ',',   -- can be also E'\t', ';', '|', etc.
      quote_char char(1) = '"'   -- field quotation
    ) RETURNS setof text[] AS $f$
      import csv
      return csv.reader(
          line.splitlines(), 
          quotechar=quote_char, 
          delimiter=delim_char, 
          skipinitialspace=True, 
          escapechar='\\'
      )
    $f$ IMMUTABLE language PLpythonU;

    CREATE FUNCTION csv.split_csv_line2(
      text, char(1) DEFAULT ',', char(1) DEFAULT '"'
    ) RETURNS text[] AS $f$
      SELECT x FROM csv.split_csv($1,$2,$3) x LIMIT 1;
    $f$ language SQL IMMUTABLE;

   -- not reliable but fast? http://stackoverflow.com/a/42926265/287948
   -- compare performance with csv.split_csv_line2()
   -- ideal a C implementation
    CREATE OR REPLACE FUNCTION csv.split_csv_line(
       line text,                 -- the input CSV string
       delim_char char(1) = ',',  -- can be also E'\t', ';', '|', etc.
       quote_char char(1) = '"',  -- field quotation
       OUT result text[]
    ) LANGUAGE plpgsql AS
    $f$DECLARE
       i integer;
       t text := '';
       c char(1);
       /*
        * 0 means unquoted
        * 1 means quoted
        * 2 means quoted, and we just read a quote
        */
       q integer := 0;
    BEGIN
       /* loop through the characters */
       FOR i IN 1..length(line) LOOP
          /* get i-th character */
          c := substring(line FROM i FOR 1);
          /* end of string is at an unquoted delimiter */
          IF c = delim_char AND q <> 1 THEN
             result := result || t;
             t := '';
             q := 0;
          ELSIF c = quote_char THEN
             CASE q
                WHEN 0 THEN
                   q := 1;
                WHEN 1 THEN
                   IF c = quote_char THEN
                      q := 2;
                   ELSE
                      t := t || c;
                   END IF;
                WHEN 2 THEN
                   q := 1;
                   t := t || quote_char;
             END CASE;
          ELSE
             IF q = 2 THEN
                q := 0;
             END IF;
             t := t || c;
          END IF;
       END LOOP;
       /* add the last string */
       result := result || t;
    END;$f$;

    CREATE EXTENSION file_fdw WITH SCHEMA csv;
    CREATE SERVER files FOREIGN DATA WRAPPER file_fdw;

    CREATE or replace FUNCTION csv.ftable_drop(tname text) RETURNS void AS $f$
      BEGIN
      EXECUTE format('DROP VIEW IF EXISTS %s', 'csv.'||$1);
      EXECUTE format('DROP FOREIGN TABLE IF EXISTS %s CASCADE', 'csv.fgn_'||$1);
      END
    $f$ language PLpgSQL;
    
    CREATE or replace FUNCTION csv.ftable_create(
         tname text,  -- table name for scan text and "csv_tname"  
         fpath text,  -- file path to the input CSV
         delim_char char(1) = ',',   -- can be also E'\t', ';', '|', etc.
         quote_char char(1) = '"',   -- field quotation
         p_check boolean default true  -- false to not check-and-drop.
    ) RETURNS text AS $f$
      DECLARE
        fgn_tname text;
        csv_tname text;
      BEGIN
      fgn_tname := 'csv.fgn_'||$1;
      csv_tname := 'csv.'||$1;
      IF p_check THEN PERFORM csv.ftable_drop($1); END IF;
      EXECUTE format(E'CREATE FOREIGN TABLE %s (line text) SERVER files OPTIONS ( filename %L, format \'text\')', fgn_tname, $2);
      EXECUTE format('CREATE VIEW %s AS SELECT csv.split_csv_line(line,%L,%L) as c FROM %s', csv_tname, delim_char,quote_char, fgn_tname);
      RETURN format(E'\ntables %I and %I created\n',fgn_tname,csv_tname);
      END
    $f$ language PLpgSQL;

    ---- TESTING ----
    SELECT csv.ftable_create('my','/tmp/TSE2/lix.csv',';');

@ppKrauss ppKrauss changed the title check best performance after stable algorithms check best performance after stable CSV algorithms Mar 21, 2017
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant