///////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	NIMA.c
  Purpose:	This program is used to process non-US geographical
  		database files from NIMA to produce GutenMark
		compatible wordlists.  Only placenames consisting of a
		single word are retained by this program.  Because the
		NIMA databases contain many characters that are not
		contained in our 8-bit character set, we transparently
		convert these characters by removing diacriticals.
  Mods:		12/22/01 RSB	Began.
  		12/23/01 RSB	Allowed words containing unsupported
				characters, question marks, or numerals
				to be deleted.  Also, treat () and (()) 
				and <> as removable delimiters.  Reject
				words that begin with a hyphen, or contain
				double-quotes or end-parens within their
				bodies.
  
  Note that the NIMA databases theoretically support a much larger 
  character set than can be fit into the 8-bit HTML 4.0 characters, so
  a lot of the weirder characters cannot be supported, or else are supported
  only by automatic removal of diacriticals.
*/
///////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <string.h>
#include <ctype.h>

// Character-translation table for the NIMA data.
//
// The NIMA data uses six different geographical regions, each with its
// own separate character set, as defined in regions.pdf.  These are mostly 
// similar to our 8-bit character set, but are not identical.  We have 
// to define all characters with numerical values from 32 to 255.  Where
// it just doesn't matter, or is irreproducible, we use a value of 0.
//
// Layout:  there is one row per character code (row index = code - 32)
// and one column per region (column index = region code - 1).  Each entry
// is the character that main() substitutes for that code when it appears
// in a name from that region.  A name containing a character whose entry
// is still 0 at lookup time is discarded by main().  The rows for A-Z and
// a-z, and the zero entries in rows 192-255, are only placeholders:
// main() replaces them with the character's own code before any input is
// read.
//
// NOTE(review): rows 128-159 and 160-191 look like upper/lowercase mirror
// images of each other, but a few pairs are not symmetric (for example
// #128/#160 in the last column, #143/#175, #144/#176), and likewise in
// rows 192-255 (for example #203/#235).  Confirm against regions.pdf
// whether these differences are intended.
unsigned char RegionChars[224][6] = {
  {' ', ' ', ' ', ' ', ' ', ' '},	// #32
  {'!', '!', '!', '!', '!', 'O'},	// #33
  {'\"', '\"', '\"', '\"', '\"', '\"'},	// #34
  {'#', '#', '#', '#', '#', 'u'},	// #35
  {'$', '$', '$', '$', '$', 'U'},	// #36
  {'%', '%', '%', '%', '%', '%'},	// #37
  {'&', '&', '&', '&', '&', 'o'},	// #38
  {'\'', '\'', '\'', '\'', '\'', '\''},	// #39
  {'(', '(', '(', '(', '(', '('},	// #40
  {')', ')', ')', ')', ')', ')'},	// #41
  {'*', '*', '*', '*', '*', '*'},	// #42
  {'+', '+', '+', '+', '+', 'O'},	// #43
  {',', ',', ',', ',', ',', ','},	// #44
  {'-', '-', '-', '-', '-', '-'},	// #45
  {'.', '.', '.', '.', '.', '.'},	// #46
  {'/', '/', '/', '/', '/', '/'},	// #47
  {'0', '0', '0', '0', '0', '0'},	// #48
  {'1', '1', '1', '1', '1', '1'},	// #49
  {'2', '2', '2', '2', '2', '2'},	// #50
  {'3', '3', '3', '3', '3', '3'},	// #51
  {'4', '4', '4', '4', '4', '4'},	// #52
  {'5', '5', '5', '5', '5', '5'},	// #53
  {'6', '6', '6', '6', '6', '6'},	// #54
  {'7', '7', '7', '7', '7', '7'},	// #55
  {'8', '8', '8', '8', '8', '8'},	// #56
  {'9', '9', '9', '9', '9', '9'},	// #57
  {':', ':', ':', ':', ':', ':'},	// #58
  {';', ';', ';', ';', ';', ';'},	// #59
  {'<', '<', '<', '<', '<', 'O'},	// #60
  {'=', '=', '=', '=', '=', 'U'},	// #61
  {'>', '>', '>', '>', '>', 'u'},	// #62
  {'?', '?', '?', '?', '?', 'o'},	// #63
  {'@', '@', '@', '@', '@', 'O'},	// #64
  // We won't bother with A-Z, since these are the same for
  // all regions as the normal ASCII characters.  (26 placeholder rows,
  // #65-#90.)
  {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0},
  {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0},
  {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0},
  {'[', '[', '[', '[', '[', '['},	// #91
  {'\\', '\\', '\\', '\\', '\\', 'o'},	// #92
  {']', ']', ']', ']', ']', ']'},	// #93
  {'^', '^', '^', '^', '^', 'U'},	// #94
  {'_', '_', '_', '_', '_', '_'},	// #95
  {'`', '`', '`', '`', '`', 'o'},	// #96
  // We won't bother with a-z, since these are the same for
  // all regions as the normal ASCII characters.  (26 placeholder rows,
  // #97-#122.)
  {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0},
  {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0},
  {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0},
  {'{', '{', '{', '{', '{', 'o'},	// #123
  {'|', '|', '|', '|', '|', 'U'},	// #124
  {'}', '}', '}', '}', '}', 'u'},	// #125
  {'~', '~', '~', '~', '~', 'u'},	// #126
  {0},				// #127
  {0, 'C', 'C', 'R', 0, 194},	// #128
  {'C', 'C', 'D', 'D', 'D', 'A'},	// #129
  {0, 'D', 'D', 'D', 0, 'A'},	// #130
  {0, 'A', 'D', 'D', 0, 'A'},	// #131
  {0, 'E', 'E', 'E', 'E', 'A'},	// #132
  {0, 'E', 'G', 'E', 'E', 'A'},	// #133
  {0, 'E', 'S', 'S', 'S', 'A'},	// #134
  {0},				// #135
  {0, 'S', 0, 0, 'S', 'A'},	// #136
  {0, 'I', 'I', 'I', 0, 'A'},	// #137
  {0, 'L', 0, 0, 'P', 'E'},	// #138
  {'N', 'K', 'G', 'G', 'E', 'E'},	// #139
  {0, 'U', 'H', 'H', 'H', 'E'},	// #140
  {0, 'E', 0, 'I', 'I', 'E'},	// #141
  {'O', 'O', 0, 0, 0, 'E'},	// #142
  {0, 'L', '`', '`', 0, 'E'},	// #143
  {0, 0, 0, 0, 0, 'O'},		// #144
  {0, 'R', 'H', 'H', 'H', 'E'},	// #145
  {0, 'N', 'H', 0, 0, 'U'},	// #146
  {0, 'N', 0, 'N', 'N', 'U'},	// #147
  {0, 'R', 'H', 0, 'N', 'U'},	// #148
  {'S', 'S', 'S', 'S', 'S', 'U'},	// #149
  {0, 'S', 'S', 'S', 'S', 'Y'},	// #150
  {'\"', '\"', '\"', '\"', '\"', 'Y'},	// #151
  {0, 'L', 'W', 'T', 'M', 'I'},	// #152
  {0, 'T', 'T', 'T', 'T', 'I'},	// #153
  {0, 'T', 'T', 'T', 'T', 'O'},	// #154
  {'T', 'G', 'Z', 'Z', 'Z', 'O'},	// #155
  {'Z', 'Z', 'Z', 'Z', 'Z', 'O'},	// #156
  {0, 'Z', 'Z', 0, 0, 'O'},	// #157
  {0, 'Z', 'Z', 'Z', 'Z', 'O'},	// #158
  {0, 'I', 'I', 'I', 'I', 'I'},	// #159
  {0, 'c', 'c', 'r', 0, 'a'},	// #160
  {'c', 'c', 'd', 'd', 'd', 'a'},	// #161
  {0, 'd', 'd', 'd', 0, 'a'},	// #162
  {0, 'a', 'd', 'd', 0, 'a'},	// #163
  {0, 'e', 'e', 'e', 'e', 'a'},	// #164
  {0, 'e', 'g', 'e', 'e', 'a'},	// #165
  {0, 'e', 's', 's', 's', 'a'},	// #166
  {0},				// #167
  {0, 's', 0, 0, 's', 'a'},	// #168
  {0, 'i', 'i', 'i', 0, 'a'},	// #169
  {0, 'l', 0, 0, 'p', 'e'},	// #170
  {'n', 'k', 'g', 'g', 'e', 'e'},	// #171
  {0, 'u', 'h', 'h', 'h', 'e'},	// #172
  {0, 'e', 0, 'i', 'i', 'e'},	// #173
  {'o', 'o', 0, 0, 0, 'e'},	// #174
  {0, 'l', '`', '`', '`', 'e'},	// #175
  {0},				// #176
  {0, 'r', 'h', 'h', 'h', 'e'},	// #177
  {0, 'n', 'h', 0, 0, 'u'},	// #178
  {0, 'n', 0, 'n', 'n', 'u'},	// #179
  {0, 'r', 'h', 0, 'n', 'u'},	// #180
  {'s', 's', 's', 's', 's', 'u'},	// #181
  {0, 's', 's', 's', 's', 'y'},	// #182
  {0, 0, 0, 0, 0, 'y'},		// #183
  {0, 'l', 'w', 't', 'm', 'i'},	// #184
  {0, 't', 't', 't', 't', 'i'},	// #185
  {0, 't', 't', 't', 't', 'o'},	// #186
  {'t', 'g', 'z', 'z', 'z', 'o'},	// #187
  {'z', 'z', 'z', 'z', 'z', 'o'},	// #188
  {0, 'z', 'z', 0, 0, 'o'},	// #189
  {0, 'z', 'z', 'z', 'z', 'o'},	// #190
  {0, 'i', 'i', 'i', 'i', 'i'},	// #191
  // The characters from 192-255 are treated a little differently.
  // They are assumed to be equal to their numeric indices (like
  // a-z and A-Z are) EXCEPT where a non-zero value appears in the
  // table below.
  {0, 0, 0, 0, 0, 0},		// #192
  {0, 0, 0, 0, 0, 0},		// #193
  {0, 0, 0, 0, 0, 0},		// #194
  {0, 'L', 0, 0, 0, 0},		// #195
  {0, 0, 0, 0, 0, 'A'},		// #196
  {0, 'A', 'A', 'A', 'A', 'A'},	// #197
  {0, 'A', 0, 0, 'A', 'A'},	// #198
  {0, 0, 0, 0, 0, 0},		// #199
  {0, 0, 0, 0, 0, 0},		// #200
  {0, 0, 0, 0, 0, 0},		// #201
  {0, 'I', 0, 0, 0, 0},		// #202
  {0, 0, 0, 0, 0, 'A'},		// #203
  {0, 0, 0, 0, 0, 0},		// #204
  {0, 0, 0, 0, 0, 0},		// #205
  {0, 0, 0, 0, 0, 0},		// #206
  {0, 0, 0, 0, 0, 'a'},		// #207
  {0, 0, 'N', 'O', 'O', 0},	// #208
  {0, 'N', 'N', 0, 0, 'E'},	// #209
  {0, 0, 0, 0, 0, 0},		// #210
  {0, 0, 0, 0, 0, 0},		// #211
  {0, 0, 0, 0, 0, 0},		// #212
  {0, 0, 0, 0, 0, 0},		// #213
  {0, 0, 0, 0, 0, 'O'},		// #214
  {'U', 0, 0, 'O', 'O', 'O'},	// #215
  {0, 'U', 'U', 'U', 'U', 'U'},	// #216
  {0, 0, 0, 0, 0, 0},		// #217
  {0, 0, 0, 0, 0, 0},		// #218
  {0, 'U', 0, 0, 0, 0},		// #219
  {0, 0, 0, 0, 0, 'O'},		// #220
  {0, 0, 0, 0, 0, 0},		// #221
  {0, 'U', 0, 'U', 'U', 'Y'},	// #222
  {0, 0, 0, 0, 0, 'Y'},		// #223
  {0, 0, 0, 0, 0, 0},		// #224
  {0, 0, 0, 0, 0, 0},		// #225
  {0, 0, 0, 0, 0, 0},		// #226
  {0, 'l', 0, 0, 0, 0},		// #227
  {0, 0, 0, 0, 0, 'a'},		// #228
  {0, 'a', 'a', 'a', 'a', 'a'},	// #229
  {0, 'a', 0, 0, 'a', 'a'},	// #230
  {0, 0, 0, 0, 0, 0},		// #231
  {0, 0, 0, 0, 0, 0},		// #232
  {0, 0, 0, 0, 0, 0},		// #233
  {0, 'i', 0, 0, 0, 0},		// #234
  {0, 0, 0, 0, 0, 0},		// #235
  {0, 0, 0, 0, 0, 0},		// #236
  {0, 0, 0, 0, 0, 0},		// #237
  {0, 0, 0, 0, 0, 0},		// #238
  {0, 0, 0, 0, 0, 0},		// #239
  {0, 0, 'n', 'o', 'o', 0},	// #240
  {0, 'n', 0, 0, 0, 'e'},	// #241
  {0, 0, 0, 0, 0, 0},		// #242
  {0, 0, 0, 0, 0, 0},		// #243
  {0, 0, 0, 0, 0, 0},		// #244
  {0, 0, 0, 0, 0, 'o'},		// #245
  {0, 0, 0, 0, 0, 'o'},		// #246
  {'u', 0, 0, 'o', 'o', 'o'},	// #247
  {0, 'u', 'u', 'u', 'u', 'u'},	// #248
  {0, 0, 0, 0, 0, 0},		// #249
  {0, 0, 0, 0, 0, 0},		// #250
  {0, 'u', 0, 0, 0, 0},		// #251
  {0, 0, 0, 0, 0, 'o'},		// #252
  {0, 0, 0, 0, 0, 0},		// #253
  {0, 'u', 0, 'u', 'u', 'y'},	// #254
  {240, 0, 0, 0, 0, 'y'}	// #255
};

// Working storage for main():  s holds the input record currently being
// processed, ss points at the start of the name within s, and sss is the
// scanning pointer (and, once the name has been scanned, its end).
char s[1000], *ss, *sss;

//-------------------------------------------------------------------------
// Fills in the entries of RegionChars that the initializer leaves as 0
// but that really mean "same as the character code":  A-Z and a-z always
// map to themselves, and codes 192-255 map to themselves unless the table
// already supplies a replacement.

static void
FillCharacterTable (void)
{
  int Code, Region;

  for (Code = ' '; Code <= 255; Code++)
    for (Region = 0; Region < 6; Region++)
      {
	unsigned char *Entry = &RegionChars[Code - ' '][Region];

	if ((Code >= 'a' && Code <= 'z') || (Code >= 'A' && Code <= 'Z'))
	  *Entry = (unsigned char) Code;
	else if (Code >= 192 && !*Entry)
	  *Entry = (unsigned char) Code;
      }
}

//-------------------------------------------------------------------------
// Translates, in place, the run of non-whitespace characters starting at
// Name, using column Region (0-5) of RegionChars.
//
// Returns a pointer to the character that stopped the scan (the first
// whitespace character, or the terminating NUL), or NULL if the name has
// to be discarded because it contains a control character, a character
// with no translation, a numeral, or a question mark.

static char *
TranslateName (char *Name, int Region)
{
  char *p;

  for (p = Name; *p; p++)
    {
      // Every byte is handled as unsigned char:  plain char may be signed,
      // in which case bytes >= 128 would be negative, which is not a valid
      // argument for the <ctype.h> functions and would index in front of
      // the table.
      unsigned char c = (unsigned char) *p;

      if (isspace (c))
	break;
      // The table starts at code 32, so control characters have no row.
      if (c < ' ')
	return (NULL);
      c = RegionChars[c - ' '][Region];
      // Found an unsupported character?
      if (!c || isdigit (c) || c == '?')
	return (NULL);
      *p = (char) c;
    }
  return (p);
}

//-------------------------------------------------------------------------
// Reads NIMA records from stdin and writes one placename per line to
// stdout.  Each record consists of many tab-delimited fields, of which
// only field #1 (region code, a value from 1-6) and field #24 (the
// feature name) are used.  A name is written only if it is a single word
// that is followed by another field, does not begin with a hyphen, and
// translates completely through RegionChars.  Enclosing "", <>, () or
// (()) delimiters are removed; names that still contain a double-quote
// or an end-paren after that are discarded, as are names that are empty.
// Always returns 0.

int
main (void)
{
  int i, RegionCode;

  // Simplify things a little by filling in some voids we've left
  // in the character tables.
  FillCharacterTable ();

  while (NULL != fgets (s, sizeof (s), stdin))
    {
      // If the record didn't fit in the buffer, throw away the rest of
      // it, so that the tail isn't mistaken for a new record.
      if (NULL == strchr (s, '\n'))
	{
	  int c;

	  while (EOF != (c = getchar ()) && c != '\n')
	    ;
	}

      if (1 != sscanf (s, "%d", &RegionCode))
	continue;
      if (RegionCode < 1 || RegionCode > 6)
	continue;
      RegionCode--;		// Convert to a 0-based table column.

      // Search out the name field, by skipping over 23 tabs.
      for (sss = s, i = 0; *sss && i < 23; sss++)
	if (*sss == '\t')
	  i++;
      if (!*sss)
	continue;
      ss = sss;
      if (*ss == '-')
	continue;

      // Translate the name.  It must be non-empty and must end at a tab;
      // ending at any other whitespace means that it is either more than
      // one word or else is not followed by another field.
      sss = TranslateName (ss, RegionCode);
      if (sss == NULL || *sss != '\t' || sss == ss)
	continue;

      // Remove one level of enclosing "" or <>, or up to two levels of
      // enclosing ().  At most one of these cases is applied.
      if (sss - ss >= 2)
	{
	  if ((*ss == '\"' && sss[-1] == '\"')
	      || (*ss == '<' && sss[-1] == '>'))
	    {
	      ss++;
	      sss--;
	    }
	  else if (*ss == '(' && sss[-1] == ')')
	    {
	      ss++;
	      sss--;
	      if (sss - ss >= 2 && *ss == '(' && sss[-1] == ')')
		{
		  ss++;
		  sss--;
		}
	    }
	}
      *sss = 0;

      // Nothing left, or a double-quote or end-paren within the body?
      if (sss == ss || NULL != strpbrk (ss, "\")"))
	continue;

      printf ("%s\n", ss);
    }

  return (0);
}
