I needed a function to analyse a file for delimiters and line endings prior to importing the file into MySQL with LOAD DATA LOCAL INFILE.
I wrote this function to do the job; the results are (mostly) very accurate, and it works nicely with large files too.
<?php
/**
 * Sorts occurrence counts ascending and guarantees the winning key is last.
 *
 * The winner is the key with the highest count. When several keys share the
 * highest count, the one declared first in $counts wins, so callers express
 * their preference through declaration order instead of relying on how the
 * sort algorithm happens to order equal values.
 *
 * @param array $counts Map of candidate key => number of occurrences.
 * @return array The same map sorted ascending by count, winner moved to the end.
 */
function analyse_file_rank_counts(array $counts) {
    $winner = null;
    foreach ($counts as $key => $count) {
        // Strict ">" keeps the earliest-declared key on ties.
        if ($winner === null || $count > $counts[$winner]) {
            $winner = $key;
        }
    }

    asort($counts);

    if ($winner !== null) {
        // Re-append the winner so end()/key() on the result always yield it.
        $top = $counts[$winner];
        unset($counts[$winner]);
        $counts[$winner] = $top;
    }

    return $counts;
}

/**
 * Guesses the line ending and field delimiter of a text file by sampling its
 * first $capture_limit_in_kb kilobytes.
 *
 * The returned array contains:
 *  - 'peak_mem'    => array('start' => int, 'end' => int) from memory_get_peak_usage(true)
 *  - 'read_kb'     => the capture limit that was requested
 *  - 'line_ending' => array('results' => counts per candidate, 'count', 'key', 'value')
 *  - 'lines'       => array('count' => sampled lines, 'length' => bytes of the joined lines)
 *  - 'delimiter'   => array('results' => counts per candidate, 'count', 'key', 'value')
 *
 * In both 'results' arrays the last element is the detected winner. A winner is
 * always reported, even when its count is 0, so check 'count' before trusting it.
 *
 * @param string    $file                Path of the file to analyse.
 * @param int|float $capture_limit_in_kb How many kilobytes to sample from the start of the file.
 * @return array Analysis result as described above.
 * @throws InvalidArgumentException When the capture limit is not a positive number.
 * @throws RuntimeException         When the file cannot be opened.
 */
function analyse_file($file, $capture_limit_in_kb = 10) {
    if (!is_numeric($capture_limit_in_kb) || $capture_limit_in_kb * 1024 < 1) {
        throw new InvalidArgumentException('Capture limit must be a positive number of kilobytes.');
    }
    $byte_limit = (int) ($capture_limit_in_kb * 1024);

    $output = array();
    $output['peak_mem']['start'] = memory_get_peak_usage(true);
    $output['read_kb'] = $capture_limit_in_kb;

    // Binary mode so line-ending bytes are never translated by the platform.
    $fh = fopen($file, 'rb');
    if ($fh === false) {
        throw new RuntimeException("Unable to open file for analysis: {$file}");
    }
    $contents = fread($fh, $byte_limit);
    // Probe one more byte: if it exists, the sample stopped before the end of
    // the file and its final segment may be a partial line.
    $truncated = ($contents !== false) && (fgetc($fh) !== false);
    fclose($fh);
    if ($contents === false) {
        $contents = '';
    }

    // Declaration order doubles as tie-break priority (first wins).
    $delimiters = array(
        'comma' => ',',
        'semicolon' => ';',
        'tab' => "\t",
        'pipe' => '|',
        'colon' => ':'
    );
    // Two-byte sequences come first: a CRLF file contains exactly as many
    // "\n" and "\r" as "\r\n", so the longer sequence must win the tie.
    $line_endings = array(
        'rn' => "\r\n",
        'nr' => "\n\r",
        'n' => "\n",
        'r' => "\r"
    );

    $line_result = array();
    foreach ($line_endings as $key => $value) {
        $line_result[$key] = substr_count($contents, $value);
    }
    $line_result = analyse_file_rank_counts($line_result);
    $output['line_ending']['results'] = $line_result;
    $output['line_ending']['count'] = end($line_result);
    $output['line_ending']['key'] = key($line_result);
    $output['line_ending']['value'] = $line_endings[$output['line_ending']['key']];

    $lines = explode($output['line_ending']['value'], $contents);
    // Drop the final segment only when it is not a complete line: either the
    // sample was cut off mid-file, or it is the empty string that follows a
    // trailing line ending.
    if ($truncated || end($lines) === '') {
        array_pop($lines);
    }
    $complete_lines = implode(' ', $lines);
    $output['lines']['count'] = count($lines);
    $output['lines']['length'] = strlen($complete_lines);

    $delimiter_result = array();
    foreach ($delimiters as $delimiter_key => $delimiter) {
        $delimiter_result[$delimiter_key] = substr_count($complete_lines, $delimiter);
    }
    $delimiter_result = analyse_file_rank_counts($delimiter_result);
    $output['delimiter']['results'] = $delimiter_result;
    $output['delimiter']['count'] = end($delimiter_result);
    $output['delimiter']['key'] = key($delimiter_result);
    $output['delimiter']['value'] = $delimiters[$output['delimiter']['key']];

    $output['peak_mem']['end'] = memory_get_peak_usage(true);
    return $output;
}
?>
Example Usage:
<?php
// Sample the first 10 KB of the CSV; $Array receives the analysis result array.
$Array = analyse_file(
    '/www/files/file.csv',
    10
);
?>
Full function output:
Array
(
[peak_mem] => Array
(
[start] => 786432
[end] => 786432
)
[line_ending] => Array
(
[results] => Array
(
[nr] => 0
[r] => 4
[n] => 4
[rn] => 4
)
[count] => 4
[key] => rn
[value] =>
)
[lines] => Array
(
[count] => 4
[length] => 94
)
[delimiter] => Array
(
[results] => Array
(
[colon] => 0
[semicolon] => 0
[pipe] => 0
[tab] => 1
[comma] => 17
)
[count] => 17
[key] => comma
[value] => ,
)
[read_kb] => 10
)
Enjoy!
Ashley