Megatest

Check-in [81649543cf]
Login
Overview
Comment:Extracting sheets to txtdb now works
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | dev
Files: files | file ages | folders
SHA1: 81649543cf84f528b5dcab1b27d6c492aa29b44e
User & Date: matt on 2013-07-14 02:19:22
Other Links: branch diff | manifest | tags
Context
2013-07-14
02:23
Extracting sheets to txtdb now works check-in: 9cacf1f935 user: matt tags: dev
02:19
Extracting sheets to txtdb now works check-in: 81649543cf user: matt tags: dev
00:45
Added snippets to txtdb that disentangle bits n pieces of xml from gnumeric file check-in: b8bf0ef6c4 user: matt tags: dev
Changes

Modified txtdb/txtdb.scm from [9e703b192e] to [8f826c3024].

1
2
3
4
5
6
7
8
9
10
11
12
13


14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37



























38

39
40
41
42
43


44
45
46
47
48
49
50





51









52




















53
54



55







56
57
58
59
60
61
62
63
64

;; Copyright 2006-2012, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.

(use ssax)
(use sxml-serializer)
(use sxml-modifications)



;; Read a non-compressed gnumeric file
(define (txtdb:read-gnumeric-xml fname)
  (with-input-from-file fname
    (lambda ()
      (ssax:xml->sxml (current-input-port) '()))))


(define (find-section dat section #!key (depth 0))
  (let loop ((hed   (car dat))
	     (tal   (cdr dat)))   
    (if (list? hed)
	(let ((res (find-section hed section depth: (+ depth 1))))
	  (if res 
	      res
	      (if (null? tal)
		  #f
		  (loop (car tal)(cdr tal)))))
	(if (eq? hed section)
	    tal
	    (if (null? tal)
		#f
		(loop (car tal)(cdr tal)))))))




























(define (sheet->txtdb dat)

  (let ((at-info     (find-section dat '@)) ;; misc info about the sheet
	(sheet-name  (find-section dat 'http://www.gnumeric.org/v10.dtd:Name))
	(cells       (find-section dat 'http://www.gnumeric.org/v10.dtd:Cells)))
    (print "sheet-name: " sheet-name)
    (print "Rownum\tColnum\tType\tValue")


    (for-each (lambda (cell)
		(let ((rownum  (string->number (car (find-section cell 'Row))))
		      (colnum  (string->number (car (find-section cell 'Col))))
		      (valtype (let ((res (find-section cell 'ValueType)))
				 (if res (car res) #f)))
		      (value   (let ((res (cdr (filter (lambda (x)(not (list? x))) cell))))
				 (if (null? res) "" (car res)))))





		  (print rownum "\t" colnum "\t" valtype "\t" value)))









	      cells)))





















(define (extract-txtdb dat)



  (let ((sheets (find-section dat 'http://www.gnumeric.org/v10.dtd:Sheets)))







    (for-each (lambda (sheet)
		(sheet->txtdb sheet))
	      sheets)))

#|  
 (define x (txtdb:read-gnumeric-xml "testdata-stripped.xml"))
















>
>






<

















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
|
|
|
<
|
>
>







>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
>
>
>
|
>
>
>
>
>
>
>

|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

;; Copyright 2006-2012, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.

(use ssax)
(use sxml-serializer)
(use sxml-modifications)
(use regex)
(use srfi-69)

;; Read a non-compressed gnumeric file
(define (txtdb:read-gnumeric-xml fname)
  (with-input-from-file fname
    (lambda ()
      (ssax:xml->sxml (current-input-port) '()))))


(define (find-section dat section #!key (depth 0))
  (let loop ((hed   (car dat))
	     (tal   (cdr dat)))   
    (if (list? hed)
	(let ((res (find-section hed section depth: (+ depth 1))))
	  (if res 
	      res
	      (if (null? tal)
		  #f
		  (loop (car tal)(cdr tal)))))
	(if (eq? hed section)
	    tal
	    (if (null? tal)
		#f
		(loop (car tal)(cdr tal)))))))

(define (remove-section dat section)
  (if (null? dat)
      '()
      (let loop ((hed (car dat))
		 (tal (cdr dat))
		 (res '()))
	(let ((newres (if (and (list? hed)
			       (not (null? hed))
			       (equal? (car hed) section))
			  res
			  (cons hed res))))
	  (if (null? tal)
	      (reverse newres)
	      (loop (car tal)(cdr tal) newres))))))

(define (list-sections dat)
  (filter (lambda (x)(and x))
	  (map (lambda (section)
		 (if (and (list? section)
			  (not (null? section)))
		     (car section)
		     #f))
	       dat)))

(define (string->safe-filename str)
  (string-substitute (regexp " ") "_" str #t))

(define (sheet->txtdb dat targdir)
  (let ((sheet-name  (string->safe-filename (car (find-section dat 'http://www.gnumeric.org/v10.dtd:Name))))
	(cells       (find-section dat 'http://www.gnumeric.org/v10.dtd:Cells))
	(remaining   (remove-section (remove-section dat 'http://www.gnumeric.org/v10.dtd:Name)
				     'http://www.gnumeric.org/v10.dtd:Cells))

	(rownums     (make-hash-table))  ;; num -> name
	(colnums     (make-hash-table))  ;; num -> name
	(cols        (make-hash-table))) ;; name -> ( (name val) ... )
    (for-each (lambda (cell)
		(let ((rownum  (string->number (car (find-section cell 'Row))))
		      (colnum  (string->number (car (find-section cell 'Col))))
		      (valtype (let ((res (find-section cell 'ValueType)))
				 (if res (car res) #f)))
		      (value   (let ((res (cdr (filter (lambda (x)(not (list? x))) cell))))
				 (if (null? res) "" (car res)))))
		  ;; If colnum is 0 Then this is a row name, if rownum is 0 then this is a col name
		  (cond
		   ((eq? 0 colnum) ;; a blank in column zero is handled with the special name "row-N"
		    (hash-table-set! rownums rownum (if (equal? value "")
							(conc "row-" rownum)
							value)))
		   ((eq? 0 rownum)
		    (hash-table-set! colnums colnum (if (equal? value "")
							(conc "col-" colnum)
							value)))
		   (else
		    (let ((colname (hash-table-ref/default colnums colnum (conc "col-" colnum)))
			  (rowname (hash-table-ref/default rownums rownum (conc "row-" rownum))))
		      (hash-table-set! cols colname (cons (list rowname value) 
							  (hash-table-ref/default cols colname '()))))))))
	      cells)
    (let ((ref-colnums (map (lambda (c)
			      (list (cdr c)(car c)))
			    (hash-table->alist colnums))))
      (with-output-to-file (conc targdir "/" sheet-name ".dat")
	(lambda ()
	  (for-each (lambda (colname)
		      (print "[" colname "]")
		      (for-each (lambda (row)
				  (print (car row) " " (cadr row)))
				(reverse (hash-table-ref cols colname)))
		      (print))
		    (sort (hash-table-keys cols)(lambda (a b)
						  (let ((colnum-a (assoc a ref-colnums))
							(colnum-b (assoc b ref-colnums)))
						    (if (and colnum-a colnum-b)
							(< (cadr colnum-a)(cadr colnum-b))
							(if (and (string? a)
								 (string? b))
							    (string< a b))))))))))))


(define (extract-txtdb dat targdir)
  (let* ((wrkbk   (find-section   dat   'http://www.gnumeric.org/v10.dtd:Workbook))
	 (wrk-rem (remove-section dat   'http://www.gnumeric.org/v10.dtd:Workbook))
	 (sheets  (find-section   wrkbk 'http://www.gnumeric.org/v10.dtd:Sheets))
	 (sht-rem (remove-section wrkbk 'http://www.gnumeric.org/v10.dtd:Sheets)))
    (create-directory targdir)
    (with-output-to-file (conc targdir "/workbook.sxml")
      (lambda ()
	(pp wrk-rem)))
    (with-output-to-file (conc targdir "/sheets.sxml")
      (lambda ()
	(pp sht-rem)))
    (for-each (lambda (sheet)
		(sheet->txtdb sheet targdir))
	      sheets)))

#|  
 (define x (txtdb:read-gnumeric-xml "testdata-stripped.xml"))